DataCamp offers interactive courses related to R Programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
The original DataCamp_Insights_v001 document has been split for this document:
There are a few nuggets from within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
# Factors: an unordered factor stores a fixed set of levels but does NOT
# support inequality comparisons; ordered=TRUE turns the levels into a
# ranked scale so <, >, <=, >= work across levels.
xRaw = c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
# levels= fixes the display/storage order, but the factor is still unordered
xFactorNon = factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
# Comparing values of an unordered factor warns and yields NA
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
# ordered=TRUE makes the levels a ranked scale (Low < Medium < High < Very High)
xFactorOrder = factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
data(mtcars)
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & vs && (and analogously | vs ||): the single forms are elementwise and
# return a vector; the double forms are scalar and short-circuit.
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
# NOTE(review): when this was run, && / || on length > 1 operands silently
# compared only the FIRST elements (hence the single TRUE below);
# R >= 4.3 raises an error for such inputs instead.
(compA > compB) && (compA + compB < 6)
## [1] TRUE
(compA > compB) || (compA + compB < 6)
## [1] TRUE
# Loops and cat()
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
unique(compA)
## [1] 2 3 4 1
# unlist()
listA <- as.list(compA)
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# gsub() with regex capture groups and backreferences ("\\1", "\\2", "\\3")
sampMsg <- "This is from myname@subdomain.mydomain.com again"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev()
compA
## [1] 2 3 4 1 2 3
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
args(simplify2array)
## function (x, higher = TRUE)
## NULL
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handling dates and times in R:
Sys.Date()
## [1] "2017-03-20"
Sys.time()
## [1] "2017-03-20 07:22:19 CDT"
args(strptime)
## function (x, format, tz = "")
## NULL
rightNow <- as.POSIXct(Sys.time())
# NOTE(review): "%M" is minutes, not month ("%m") -- that is why the output
# below shows "22" (the current minute) after the year; presumably "%m"
# was intended. Kept as-is so the recorded output stays accurate.
format(rightNow, "%Y**%M-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2017**22-20 07 hours and 22 minutes CDT"
# On input, %X is equivalent to %H:%M:%S (see ?strptime)
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
rightNow - lastChristmasNoon
## Time difference of 450.7655 days
# %y = two-digit year, %b = abbreviated month name; tz= pins the time zone
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of -197.8488 days
# Time zones available in R
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Baghdad" "Asia/Bahrain"
## [243] "Asia/Baku" "Asia/Bangkok"
## [245] "Asia/Beirut" "Asia/Bishkek"
## [247] "Asia/Brunei" "Asia/Calcutta"
## [249] "Asia/Chita" "Asia/Choibalsan"
## [251] "Asia/Chongqing" "Asia/Chungking"
## [253] "Asia/Colombo" "Asia/Dacca"
## [255] "Asia/Damascus" "Asia/Dhaka"
## [257] "Asia/Dili" "Asia/Dubai"
## [259] "Asia/Dushanbe" "Asia/Gaza"
## [261] "Asia/Harbin" "Asia/Hebron"
## [263] "Asia/Ho_Chi_Minh" "Asia/Hong_Kong"
## [265] "Asia/Hovd" "Asia/Irkutsk"
## [267] "Asia/Istanbul" "Asia/Jakarta"
## [269] "Asia/Jayapura" "Asia/Jerusalem"
## [271] "Asia/Kabul" "Asia/Kamchatka"
## [273] "Asia/Karachi" "Asia/Kashgar"
## [275] "Asia/Kathmandu" "Asia/Katmandu"
## [277] "Asia/Khandyga" "Asia/Kolkata"
## [279] "Asia/Krasnoyarsk" "Asia/Kuala_Lumpur"
## [281] "Asia/Kuching" "Asia/Kuwait"
## [283] "Asia/Macao" "Asia/Macau"
## [285] "Asia/Magadan" "Asia/Makassar"
## [287] "Asia/Manila" "Asia/Muscat"
## [289] "Asia/Nicosia" "Asia/Novokuznetsk"
## [291] "Asia/Novosibirsk" "Asia/Omsk"
## [293] "Asia/Oral" "Asia/Phnom_Penh"
## [295] "Asia/Pontianak" "Asia/Pyongyang"
## [297] "Asia/Qatar" "Asia/Qyzylorda"
## [299] "Asia/Rangoon" "Asia/Riyadh"
## [301] "Asia/Saigon" "Asia/Sakhalin"
## [303] "Asia/Samarkand" "Asia/Seoul"
## [305] "Asia/Shanghai" "Asia/Singapore"
## [307] "Asia/Srednekolymsk" "Asia/Taipei"
## [309] "Asia/Tashkent" "Asia/Tbilisi"
## [311] "Asia/Tehran" "Asia/Tel_Aviv"
## [313] "Asia/Thimbu" "Asia/Thimphu"
## [315] "Asia/Tokyo" "Asia/Ujung_Pandang"
## [317] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [319] "Asia/Urumqi" "Asia/Ust-Nera"
## [321] "Asia/Vientiane" "Asia/Vladivostok"
## [323] "Asia/Yakutsk" "Asia/Yekaterinburg"
## [325] "Asia/Yerevan" "Atlantic/Azores"
## [327] "Atlantic/Bermuda" "Atlantic/Canary"
## [329] "Atlantic/Cape_Verde" "Atlantic/Faeroe"
## [331] "Atlantic/Faroe" "Atlantic/Jan_Mayen"
## [333] "Atlantic/Madeira" "Atlantic/Reykjavik"
## [335] "Atlantic/South_Georgia" "Atlantic/St_Helena"
## [337] "Atlantic/Stanley" "Australia/ACT"
## [339] "Australia/Adelaide" "Australia/Brisbane"
## [341] "Australia/Broken_Hill" "Australia/Canberra"
## [343] "Australia/Currie" "Australia/Darwin"
## [345] "Australia/Eucla" "Australia/Hobart"
## [347] "Australia/LHI" "Australia/Lindeman"
## [349] "Australia/Lord_Howe" "Australia/Melbourne"
## [351] "Australia/North" "Australia/NSW"
## [353] "Australia/Perth" "Australia/Queensland"
## [355] "Australia/South" "Australia/Sydney"
## [357] "Australia/Tasmania" "Australia/Victoria"
## [359] "Australia/West" "Australia/Yancowinna"
## [361] "Brazil/Acre" "Brazil/DeNoronha"
## [363] "Brazil/East" "Brazil/West"
## [365] "Canada/Atlantic" "Canada/Central"
## [367] "Canada/East-Saskatchewan" "Canada/Eastern"
## [369] "Canada/Mountain" "Canada/Newfoundland"
## [371] "Canada/Pacific" "Canada/Saskatchewan"
## [373] "Canada/Yukon" "CET"
## [375] "Chile/Continental" "Chile/EasterIsland"
## [377] "CST6CDT" "Cuba"
## [379] "EET" "Egypt"
## [381] "Eire" "EST"
## [383] "EST5EDT" "Etc/GMT"
## [385] "Etc/GMT-0" "Etc/GMT-1"
## [387] "Etc/GMT-10" "Etc/GMT-11"
## [389] "Etc/GMT-12" "Etc/GMT-13"
## [391] "Etc/GMT-14" "Etc/GMT-2"
## [393] "Etc/GMT-3" "Etc/GMT-4"
## [395] "Etc/GMT-5" "Etc/GMT-6"
## [397] "Etc/GMT-7" "Etc/GMT-8"
## [399] "Etc/GMT-9" "Etc/GMT+0"
## [401] "Etc/GMT+1" "Etc/GMT+10"
## [403] "Etc/GMT+11" "Etc/GMT+12"
## [405] "Etc/GMT+2" "Etc/GMT+3"
## [407] "Etc/GMT+4" "Etc/GMT+5"
## [409] "Etc/GMT+6" "Etc/GMT+7"
## [411] "Etc/GMT+8" "Etc/GMT+9"
## [413] "Etc/GMT0" "Etc/Greenwich"
## [415] "Etc/UCT" "Etc/Universal"
## [417] "Etc/UTC" "Etc/Zulu"
## [419] "Europe/Amsterdam" "Europe/Andorra"
## [421] "Europe/Athens" "Europe/Belfast"
## [423] "Europe/Belgrade" "Europe/Berlin"
## [425] "Europe/Bratislava" "Europe/Brussels"
## [427] "Europe/Bucharest" "Europe/Budapest"
## [429] "Europe/Busingen" "Europe/Chisinau"
## [431] "Europe/Copenhagen" "Europe/Dublin"
## [433] "Europe/Gibraltar" "Europe/Guernsey"
## [435] "Europe/Helsinki" "Europe/Isle_of_Man"
## [437] "Europe/Istanbul" "Europe/Jersey"
## [439] "Europe/Kaliningrad" "Europe/Kiev"
## [441] "Europe/Lisbon" "Europe/Ljubljana"
## [443] "Europe/London" "Europe/Luxembourg"
## [445] "Europe/Madrid" "Europe/Malta"
## [447] "Europe/Mariehamn" "Europe/Minsk"
## [449] "Europe/Monaco" "Europe/Moscow"
## [451] "Europe/Nicosia" "Europe/Oslo"
## [453] "Europe/Paris" "Europe/Podgorica"
## [455] "Europe/Prague" "Europe/Riga"
## [457] "Europe/Rome" "Europe/Samara"
## [459] "Europe/San_Marino" "Europe/Sarajevo"
## [461] "Europe/Simferopol" "Europe/Skopje"
## [463] "Europe/Sofia" "Europe/Stockholm"
## [465] "Europe/Tallinn" "Europe/Tirane"
## [467] "Europe/Tiraspol" "Europe/Uzhgorod"
## [469] "Europe/Vaduz" "Europe/Vatican"
## [471] "Europe/Vienna" "Europe/Vilnius"
## [473] "Europe/Volgograd" "Europe/Warsaw"
## [475] "Europe/Zagreb" "Europe/Zaporozhye"
## [477] "Europe/Zurich" "GB"
## [479] "GB-Eire" "GMT"
## [481] "GMT-0" "GMT+0"
## [483] "GMT0" "Greenwich"
## [485] "Hongkong" "HST"
## [487] "Iceland" "Indian/Antananarivo"
## [489] "Indian/Chagos" "Indian/Christmas"
## [491] "Indian/Cocos" "Indian/Comoro"
## [493] "Indian/Kerguelen" "Indian/Mahe"
## [495] "Indian/Maldives" "Indian/Mauritius"
## [497] "Indian/Mayotte" "Indian/Reunion"
## [499] "Iran" "Israel"
## [501] "Jamaica" "Japan"
## [503] "Kwajalein" "Libya"
## [505] "MET" "Mexico/BajaNorte"
## [507] "Mexico/BajaSur" "Mexico/General"
## [509] "MST" "MST7MDT"
## [511] "Navajo" "NZ"
## [513] "NZ-CHAT" "Pacific/Apia"
## [515] "Pacific/Auckland" "Pacific/Bougainville"
## [517] "Pacific/Chatham" "Pacific/Chuuk"
## [519] "Pacific/Easter" "Pacific/Efate"
## [521] "Pacific/Enderbury" "Pacific/Fakaofo"
## [523] "Pacific/Fiji" "Pacific/Funafuti"
## [525] "Pacific/Galapagos" "Pacific/Gambier"
## [527] "Pacific/Guadalcanal" "Pacific/Guam"
## [529] "Pacific/Honolulu" "Pacific/Johnston"
## [531] "Pacific/Kiritimati" "Pacific/Kosrae"
## [533] "Pacific/Kwajalein" "Pacific/Majuro"
## [535] "Pacific/Marquesas" "Pacific/Midway"
## [537] "Pacific/Nauru" "Pacific/Niue"
## [539] "Pacific/Norfolk" "Pacific/Noumea"
## [541] "Pacific/Pago_Pago" "Pacific/Palau"
## [543] "Pacific/Pitcairn" "Pacific/Pohnpei"
## [545] "Pacific/Ponape" "Pacific/Port_Moresby"
## [547] "Pacific/Rarotonga" "Pacific/Saipan"
## [549] "Pacific/Samoa" "Pacific/Tahiti"
## [551] "Pacific/Tarawa" "Pacific/Tongatapu"
## [553] "Pacific/Truk" "Pacific/Wake"
## [555] "Pacific/Wallis" "Pacific/Yap"
## [557] "Poland" "Portugal"
## [559] "PRC" "PST8PDT"
## [561] "ROC" "ROK"
## [563] "Singapore" "Turkey"
## [565] "UCT" "Universal"
## [567] "US/Alaska" "US/Aleutian"
## [569] "US/Arizona" "US/Central"
## [571] "US/East-Indiana" "US/Eastern"
## [573] "US/Hawaii" "US/Indiana-Starke"
## [575] "US/Michigan" "US/Mountain"
## [577] "US/Pacific" "US/Pacific-New"
## [579] "US/Samoa" "UTC"
## [581] "VERSION" "W-SU"
## [583] "WET" "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Additionally, code from several practice examples is added:
set.seed(1608221310)
me <- 89
other_199 <- round(rnorm(199, mean=75.45, sd=11.03), 0)
mean(other_199)
## [1] 75.17588
sd(other_199)
## [1] 11.37711
# Target per-column means and SDs for the simulated prior-year grades
desMeans <- c(72.275, 76.24, 74.5, 77.695)
desSD <- c(12.31, 11.22, 12.5, 12.53)
prevData <- c(rnorm(200, mean=72.275, sd=12.31),
rnorm(200, mean=76.24, sd=11.22),
rnorm(200, mean=74.5, sd=12.5),
rnorm(200, mean=77.695, sd=12.53)
)
# 200 x 4 matrix: one column of 200 simulated grades per prior year
previous_4 <- matrix(data=prevData, ncol=4)
# Sample moments of the draws (will not exactly match the targets)
curMeans <- apply(previous_4, 2, FUN=mean)
curSD <- apply(previous_4, 2, FUN=sd)
# Re-standardize so each column hits the desired mean/SD exactly: center by
# the sample mean, rescale by desSD/curSD, shift to desMeans. apply(..., 1, ...)
# returns rows as columns, hence the t() to restore the 200 x 4 orientation.
previous_4 <- t(apply(previous_4, 1, FUN=function(x) { desMeans + (desSD / curSD) * (x - curMeans) } ))
# Rounding to whole grades perturbs the moments only slightly:
apply(round(previous_4, 0), 2, FUN=mean)
## [1] 72.285 76.245 74.505 77.665
apply(round(previous_4, 0), 2, FUN=sd)
## [1] 12.35097 11.19202 12.49643 12.51744
previous_4 <- round(previous_4, 0)
# Merge me and other_199: my_class
my_class <- c(me, other_199)
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4
## Min. : 46.00 Min. : 43.00 Min. : 38.00 Min. : 42.00
## 1st Qu.: 68.00 1st Qu.: 63.75 1st Qu.: 69.00 1st Qu.: 65.75
## Median : 75.50 Median : 73.00 Median : 76.50 Median : 74.00
## Mean : 75.25 Mean : 72.28 Mean : 76.25 Mean : 74.50
## 3rd Qu.: 83.25 3rd Qu.: 81.00 3rd Qu.: 84.25 3rd Qu.: 82.25
## Max. :108.00 Max. :108.00 Max. :102.00 Max. :113.00
## year_5
## Min. : 38.00
## 1st Qu.: 71.00
## Median : 78.00
## Mean : 77.67
## 3rd Qu.: 86.00
## Max. :117.00
# Build boxplot of last_5
boxplot(last_5)
# How many grades in your class are higher than 75?
sum(my_class > 75)
## [1] 100
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 17
# What's the proportion of grades below or equal to 64 in the last 5 years?
mean(last_5 <= 64)
## [1] 0.191
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
my_class < 60 | my_class > 90
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [78] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [100] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [155] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [166] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [199] FALSE FALSE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.525
# How many students in the last 5 years had a grade of 80 or 90?
sum(last_5 %in% c(80, 90))
## [1] 44
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct
if (prop_less > 0.9) {
print("you're among the best 10 percent")
} else if (prop_less > 0.8) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) { print("top grades prevail") }
## [1] "top grades prevail"
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early returns in a special case (for example, when execution needs to stop before reaching the end of the function body).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent environment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
# Illustrates R's return semantics: an explicit return() exits the function
# early; otherwise the value of the LAST evaluated expression is returned.
funDummy <- function(x) {
if (x <= 2) {
print("That is too small")
return(3) # Early return: execution stops here and 3 is the result
}
ceiling(x) # Last evaluated expression becomes the implicit return value
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be heterogeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. All math operations with NA return NA; for example NA == NA will return NA.
There are some good tips on extracting elements from a list:
# Data types
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
length(mtcars)
## [1] 11
# NULL and NA
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
# Print the storage type (typeof) of each column/element of x, one per line.
foo <- function(x) {
  for (idx in seq_along(x)) {
    print(typeof(x[[idx]]))
  }
}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming convention:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
# Higher-order function demo: apply the supplied function `fun` to `x`.
do_math <- function(x, fun) {
  fun(x)
}
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
## Warning: package 'purrr' was built under R version 3.2.5
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would bomb since it is not actually an integer
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
map_lgl(.x=mtcars, .f = ~ ifelse(sum(.) > 200, TRUE, FALSE) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
map_chr(.x=mtcars, .f = ~ ifelse(sum(.) > 200, "Large", "Not So Large") )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 0.41176421 2.00652288 0.06152025 0.46963873 1.15436157
##
## [[2]]
## [1] 0.006821057 2.902712636 1.436150816 1.377836302 2.625075832
## [6] 0.680797806 0.313499192 0.718062969 2.820989906 3.134207742
##
## [[3]]
## [1] 3.3716474 2.9393673 1.8648940 3.2343343 2.1849894 2.0697179 1.0872014
## [8] 3.4970403 3.5769694 3.0999340 1.2033341 0.9839011 2.9820314 1.7116383
## [15] 0.8779558 1.6990118 2.5914013 2.3587803 3.7460957 1.2980312
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0151570 1.1573287 1.0628581 0.8805484 0.9418430
##
## [[2]]
## [1] 5.032920 4.689799 5.423525 5.265610 4.727383 5.252325 5.166292
## [8] 4.861745 5.135408 4.106679
##
## [[3]]
## [1] 9.854138 10.090939 10.045554 9.970755 10.092487 9.769531 10.140064
## [8] 9.834716 10.196817 10.047367 10.054093 10.006439 10.142002 10.092259
## [15] 10.222459 10.082440 10.067818 9.993884 10.078380 9.936942
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] -0.96707137 0.08207476 1.39498168 0.60287972 -0.15130461
##
## [[2]]
## [1] 0.01087442 0.02980483 0.81443586 0.88438198 0.67976034
##
## [[3]]
## [1] 0.2646751 1.3233260 1.1079261 1.3504952 0.6795524
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
set.seed(1607120947)
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
# Apply summary() to every element of x, returning a list (via purrr::map).
foo <- function(x) {
  map(x, summary)
}
par(mfrow=c(1, 3))
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply() which will return a vector when it can and a list otherwise. The map() and map_typ() functions in purrr are designed to be type-stable; if the output is not as expected, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Behavior of [,] and [,,drop=FALSE]
mtxTest <- matrix(data=1:9, nrow=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# Behavior of sapply() - may not get what you are expecting
# Square each element of x.
# Note: `^` always returns a double, even for integer input — kept on purpose.
foo <- function(x) {
  x^2
}
sapply(1:5, FUN=foo)
## [1] 1 4 9 16 25
class(sapply(1:5, FUN=foo))
## [1] "numeric"
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "numeric"
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "list"
This was a very enjoyable and instructive course.
Chapter 1 - Introduction to Object Oriented Programming (OOP)
Typical R usage involves a functional programming style - data to function to new data to new function to newer values, and so on. Object Oriented Programming (OOP) instead involves thinking about the data structures (objects), their functionalities, and the like:
There are nine different options for OOP in R:
How does R distinguish types of variables?
Assigning Classes and Implicit Classes:
Example code includes:
# Create these variables
a_numeric_vector <- rlnorm(50)
a_factor <- factor(
sample(c(LETTERS[1:5], NA), 50, replace = TRUE)
)
a_data_frame <- data.frame(
n = a_numeric_vector,
f = a_factor
)
a_linear_model <- lm(dist ~ speed, cars)
# Call summary() on the numeric vector
summary(a_numeric_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08694 0.58120 1.06400 1.63500 1.48800 7.43600
# Do the same for the other three objects
summary(a_factor)
## A B C D E NA's
## 5 9 8 11 11 6
summary(a_data_frame)
## n f
## Min. :0.08694 A : 5
## 1st Qu.:0.58121 B : 9
## Median :1.06361 C : 8
## Mean :1.63546 D :11
## 3rd Qu.:1.48764 E :11
## Max. :7.43560 NA's: 6
summary(a_linear_model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Gather the four common type descriptors of `x` (class, typeof, mode,
# storage.mode) into a single named character vector for easy comparison.
type_info <- function(x) {
  descriptors <- c(
    class = class(x),
    typeof = typeof(x),
    mode = mode(x),
    storage.mode = storage.mode(x)
  )
  descriptors
}
# Create list of example variables
some_vars <- list(
an_integer_vector = rpois(24, lambda = 5),
a_numeric_vector = rbeta(24, shape1 = 1, shape2 = 1),
an_integer_array = array(rbinom(24, size = 8, prob = 0.5), dim = c(2, 3, 4)),
a_numeric_array = array(rweibull(24, shape = 1, scale = 1), dim = c(2, 3, 4)),
a_data_frame = data.frame(int = rgeom(24, prob = 0.5), num = runif(24)),
a_factor = factor(month.abb),
a_formula = y ~ x,
a_closure_function = mean,
a_builtin_function = length,
a_special_function = `if`
)
# Loop over some_vars calling type_info() on each element to explore them
lapply(some_vars, FUN=type_info)
## $an_integer_vector
## class typeof mode storage.mode
## "integer" "integer" "numeric" "integer"
##
## $a_numeric_vector
## class typeof mode storage.mode
## "numeric" "double" "numeric" "double"
##
## $an_integer_array
## class typeof mode storage.mode
## "array" "integer" "numeric" "integer"
##
## $a_numeric_array
## class typeof mode storage.mode
## "array" "double" "numeric" "double"
##
## $a_data_frame
## class typeof mode storage.mode
## "data.frame" "list" "list" "list"
##
## $a_factor
## class typeof mode storage.mode
## "factor" "integer" "numeric" "integer"
##
## $a_formula
## class typeof mode storage.mode
## "formula" "language" "call" "language"
##
## $a_closure_function
## class typeof mode storage.mode
## "function" "closure" "function" "function"
##
## $a_builtin_function
## class typeof mode storage.mode
## "function" "builtin" "function" "function"
##
## $a_special_function
## class typeof mode storage.mode
## "function" "special" "function" "function"
whiteChess <- list(king="g1", queen="h4", bishops=c("c2", "g5"), knights=character(0), rooks=c("f1", "f6"), pawns=c("a2", "b2", "d4", "e3", "g2", "h2"))
blackChess <- list(king="g8", queen="d7", bishops=c("b7", "e7"), knights=character(0), rooks=c("a6", "f8"), pawns=c("a5", "c3", "c4", "d5", "f7", "g6"))
chess <- list(white=whiteChess, black=blackChess)
# Explore the structure of chess
str(chess)
## List of 2
## $ white:List of 6
## ..$ king : chr "g1"
## ..$ queen : chr "h4"
## ..$ bishops: chr [1:2] "c2" "g5"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "f1" "f6"
## ..$ pawns : chr [1:6] "a2" "b2" "d4" "e3" ...
## $ black:List of 6
## ..$ king : chr "g8"
## ..$ queen : chr "d7"
## ..$ bishops: chr [1:2] "b7" "e7"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "a6" "f8"
## ..$ pawns : chr [1:6] "a5" "c3" "c4" "d5" ...
# Override the class of chess
class(chess) <- "chess_game"
# Is chess still a list?
is.list(chess)
## [1] TRUE
# How many pieces are left on the board?
length(unlist(chess))
## [1] 24
type_info(chess) # note that typeof(), mode(), and storage.mode() all remained as list
## class typeof mode storage.mode
## "chess_game" "list" "list" "list"
Chapter 2 - Using S3
Function overloading is the property of a function of input-dependent behavior:
Methodical Thinking - determining which methods are available for an S3 generic:
S3 and Primitive Functions:
Too Much Class:
Example code includes:
# Create get_n_elements
# S3 generic: count the number of data elements in `x`.
# UseMethod() dispatches to get_n_elements.<class>() based on class(x).
get_n_elements <- function(x, ...) {
UseMethod("get_n_elements")
}
# View get_n_elements
get_n_elements
## function(x, ...) {
## UseMethod("get_n_elements")
## }
# Create a data.frame method for get_n_elements
# data.frame method: total elements = rows x columns.
# length() of a data frame is its column count, so this matches nrow * ncol.
get_n_elements.data.frame <- function(x, ...) {
  nrow(x) * length(x)
}
# Call the method on the sleep dataset
n_elements_sleep <- get_n_elements(sleep)
# View the result
n_elements_sleep
## [1] 60
# View pre-defined objects
# ls.str() ## Do not run, this can be a cluster with many variables loaded . . .
# Create a default method for get_n_elements
# Default method: flatten `x` to an atomic vector, then count its elements.
get_n_elements.default <- function(x, ...) {
  flattened <- unlist(x)
  length(flattened)
}
# Call the method on the ability.cov dataset
n_elements_ability.cov <- get_n_elements(ability.cov)
# Find methods for print
methods("print")
## [1] print.acf*
## [2] print.AES*
## [3] print.anova*
## [4] print.aov*
## [5] print.aovlist*
## [6] print.ar*
## [7] print.Arima*
## [8] print.arima0*
## [9] print.AsIs
## [10] print.aspell*
## [11] print.aspell_inspect_context*
## [12] print.bibentry*
## [13] print.Bibtex*
## [14] print.browseVignettes*
## [15] print.by
## [16] print.bytes*
## [17] print.changedFiles*
## [18] print.check_code_usage_in_package*
## [19] print.check_compiled_code*
## [20] print.check_demo_index*
## [21] print.check_depdef*
## [22] print.check_dotInternal*
## [23] print.check_make_vars*
## [24] print.check_nonAPI_calls*
## [25] print.check_package_code_assign_to_globalenv*
## [26] print.check_package_code_attach*
## [27] print.check_package_code_data_into_globalenv*
## [28] print.check_package_code_startup_functions*
## [29] print.check_package_code_syntax*
## [30] print.check_package_code_unload_functions*
## [31] print.check_package_compact_datasets*
## [32] print.check_package_CRAN_incoming*
## [33] print.check_package_datasets*
## [34] print.check_package_depends*
## [35] print.check_package_description*
## [36] print.check_package_description_encoding*
## [37] print.check_package_license*
## [38] print.check_packages_in_dir*
## [39] print.check_packages_in_dir_changes*
## [40] print.check_packages_used*
## [41] print.check_po_files*
## [42] print.check_Rd_contents*
## [43] print.check_Rd_line_widths*
## [44] print.check_Rd_metadata*
## [45] print.check_Rd_xrefs*
## [46] print.check_so_symbols*
## [47] print.check_T_and_F*
## [48] print.check_url_db*
## [49] print.check_vignette_index*
## [50] print.checkDocFiles*
## [51] print.checkDocStyle*
## [52] print.checkFF*
## [53] print.checkRd*
## [54] print.checkReplaceFuns*
## [55] print.checkS3methods*
## [56] print.checkTnF*
## [57] print.checkVignettes*
## [58] print.citation*
## [59] print.codoc*
## [60] print.codocClasses*
## [61] print.codocData*
## [62] print.colorConverter*
## [63] print.compactPDF*
## [64] print.condition
## [65] print.connection
## [66] print.data.frame
## [67] print.Date
## [68] print.default
## [69] print.dendrogram*
## [70] print.density*
## [71] print.difftime
## [72] print.dist*
## [73] print.Dlist
## [74] print.DLLInfo
## [75] print.DLLInfoList
## [76] print.DLLRegisteredRoutines
## [77] print.dummy_coef*
## [78] print.dummy_coef_list*
## [79] print.ecdf*
## [80] print.factanal*
## [81] print.factor
## [82] print.family*
## [83] print.fileSnapshot*
## [84] print.findLineNumResult*
## [85] print.formula*
## [86] print.fseq*
## [87] print.ftable*
## [88] print.function
## [89] print.getAnywhere*
## [90] print.glm*
## [91] print.hclust*
## [92] print.help_files_with_topic*
## [93] print.hexmode
## [94] print.HoltWinters*
## [95] print.hsearch*
## [96] print.hsearch_db*
## [97] print.htest*
## [98] print.html*
## [99] print.infl*
## [100] print.integrate*
## [101] print.isoreg*
## [102] print.kmeans*
## [103] print.knitr_kable*
## [104] print.Latex*
## [105] print.LaTeX*
## [106] print.lazy*
## [107] print.libraryIQR
## [108] print.listof
## [109] print.lm*
## [110] print.loadings*
## [111] print.loess*
## [112] print.logLik*
## [113] print.ls_str*
## [114] print.medpolish*
## [115] print.MethodsFunction*
## [116] print.mtable*
## [117] print.NativeRoutineList
## [118] print.news_db*
## [119] print.nls*
## [120] print.noquote
## [121] print.numeric_version
## [122] print.object_size*
## [123] print.octmode
## [124] print.packageDescription*
## [125] print.packageInfo
## [126] print.packageIQR*
## [127] print.packageStatus*
## [128] print.pairwise.htest*
## [129] print.PDF_Array*
## [130] print.PDF_Dictionary*
## [131] print.pdf_doc*
## [132] print.pdf_fonts*
## [133] print.PDF_Indirect_Reference*
## [134] print.pdf_info*
## [135] print.PDF_Keyword*
## [136] print.PDF_Name*
## [137] print.PDF_Stream*
## [138] print.PDF_String*
## [139] print.person*
## [140] print.POSIXct
## [141] print.POSIXlt
## [142] print.power.htest*
## [143] print.ppr*
## [144] print.prcomp*
## [145] print.princomp*
## [146] print.proc_time
## [147] print.raster*
## [148] print.Rd*
## [149] print.recordedplot*
## [150] print.restart
## [151] print.RGBcolorConverter*
## [152] print.rle
## [153] print.roman*
## [154] print.SavedPlots*
## [155] print.sessionInfo*
## [156] print.shiny.tag*
## [157] print.shiny.tag.list*
## [158] print.simple.list
## [159] print.smooth.spline*
## [160] print.socket*
## [161] print.srcfile
## [162] print.srcref
## [163] print.stepfun*
## [164] print.stl*
## [165] print.StructTS*
## [166] print.subdir_tests*
## [167] print.summarize_CRAN_check_status*
## [168] print.summary.aov*
## [169] print.summary.aovlist*
## [170] print.summary.ecdf*
## [171] print.summary.glm*
## [172] print.summary.lm*
## [173] print.summary.loess*
## [174] print.summary.manova*
## [175] print.summary.nls*
## [176] print.summary.packageStatus*
## [177] print.summary.ppr*
## [178] print.summary.prcomp*
## [179] print.summary.princomp*
## [180] print.summary.table
## [181] print.summaryDefault
## [182] print.table
## [183] print.tables_aov*
## [184] print.terms*
## [185] print.ts*
## [186] print.tskernel*
## [187] print.TukeyHSD*
## [188] print.tukeyline*
## [189] print.tukeysmooth*
## [190] print.undoc*
## [191] print.vignette*
## [192] print.warnings
## [193] print.xgettext*
## [194] print.xngettext*
## [195] print.xtabs*
## see '?methods' for accessing help and source code
# Commented due to no dataset "hair" on my machine
# View the structure of hair
# str(hair)
# What primitive generics are available?
.S3PrimitiveGenerics
## [1] "anyNA" "as.character" "as.complex" "as.double"
## [5] "as.environment" "as.integer" "as.logical" "as.numeric"
## [9] "as.raw" "c" "dim" "dim<-"
## [13] "dimnames" "dimnames<-" "is.array" "is.finite"
## [17] "is.infinite" "is.matrix" "is.na" "is.nan"
## [21] "is.numeric" "length" "length<-" "levels<-"
## [25] "names" "names<-" "rep" "seq.int"
## [29] "xtfrm"
# Does length.hairstylist exist?
# exists("length.hairstylist")
# What is the length of hair?
# length(hair)
kitty <- "Miaow!"
# Assign classes
class(kitty) <- c("cat", "mammal", "character")
# Does kitty inherit from cat/mammal/character vector?
inherits(kitty, "cat")
## [1] TRUE
inherits(kitty, "mammal")
## [1] TRUE
inherits(kitty, "character")
## [1] TRUE
# Is kitty a character vector?
is.character(kitty)
## [1] TRUE
# Does kitty inherit from dog?
inherits(kitty, "dog")
## [1] FALSE
# S3 generic: describe `x` according to each class in class(x), demonstrating
# method chaining with NextMethod().
what_am_i <-
function(x, ...)
{
UseMethod("what_am_i")
}
# cat method
what_am_i.cat <- function(x, ...)
{
# Write a message
print("I'm a cat")
# Call NextMethod
NextMethod("what_am_i")
}
# mammal method
what_am_i.mammal <- function(x, ...)
{
# Write a message for the "mammal" class
print("I'm a mammal")
# Delegate to the method for the next class ("character" for kitty)
NextMethod("what_am_i")
}
# character method
what_am_i.character <- function(x, ...)
{
# Write a message; no NextMethod() here, so the chain ends at this class
print("I'm a character vector")
}
# Call what_am_i()
what_am_i(kitty)
## [1] "I'm a cat"
## [1] "I'm a mammal"
## [1] "I'm a character vector"
Chapter 3 - Using R6
Object factory - R6 provides a means of storing data and objects within the same variable:
Hiding Complexity with Encapsulation - should be able to use something even if the internal (hidden) functionality is very complicated:
Generally, data available in the “private” area of a class is not available to users:
Example code includes:
# Define microwave_oven_factory
# Minimal R6 class: a single private field; R6 adds a public clone() method.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private=list(power_rating_watts=800)
)
# View the microwave_oven_factory
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## clone: function
## Private:
## power_rating_watts: 800
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Make a new microwave oven
microwave_oven <- microwave_oven_factory$new()
# Add a cook method to the factory definition
# Redefinition of the factory, now adding a public cook() method.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800
),
public = list(
# cook(): simulate cooking by sleeping time_seconds, then report completion
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
}
)
)
# Create microwave oven object
a_microwave_oven <- microwave_oven_factory$new()
# Call cook method for 1 second
a_microwave_oven$cook(time_seconds=1)
## [1] "Your food is cooked!"
# Add a close_door() method
# Redefinition adding private door state plus open/close mutator methods.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
# Public methods reach private fields via the `private` accessor
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
}
)
)
# Add an initialize method
# Redefinition adding an initialize() constructor with optional overrides.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
},
# Add initialize() method here; initialize() runs when $new() is called.
# missing() keeps each argument optional without declaring a default value.
initialize = function(power_rating_watts, door_is_open) {
if (!missing(power_rating_watts)) {
private$power_rating_watts <- power_rating_watts
}
if (!missing(door_is_open)) {
private$door_is_open <- door_is_open
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new(power_rating_watts=650, door_is_open=TRUE)
# Add a binding for power rating
# Active binding exposing a read-only view of the private field.
# (By convention, `..` prefixes mark private fields backing active bindings.)
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800
),
active = list(
# add the binding here
# No `value` argument and no setter branch, so assignment from outside errors
power_rating_watts = function() {
private$..power_rating_watts
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power rating
a_microwave_oven$power_rating_watts
## [1] 800
# Add a binding for power rating
# Read/write active binding: reads return the private field; writes are
# validated by the assertive packages (warn rather than error on bad input).
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800,
..power_level_watts = 800
),
# Add active list containing an active binding
active=list(
power_level_watts = function(value) {
if (missing(value)) {
# Read access: value is missing, so return the backing field
private$..power_level_watts
} else {
# Write access: warn if not a single number or out of [0, rating]
assertive.types::assert_is_a_number(value, severity="warning")
assertive.numbers::assert_all_are_in_closed_range(value,
lower=0,
upper=private$..power_rating_watts,
severity="warning"
)
private$..power_level_watts <- value
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power level
a_microwave_oven$power_level_watts
## [1] 800
# Try to set the power level to "400"
a_microwave_oven$power_level_watts <- "400"
## Warning in (function (value) : is_a_number : value is not of class
## 'numeric'; it has class 'character'.
## Warning: Coercing value to class 'numeric'.
# Try to set the power level to 1600 watts
a_microwave_oven$power_level_watts <- 1600
## Warning in (function (value) : is_in_closed_range : value are not all in the range [0,800].
## There was 1 failure:
## Position Value Cause
## 1 1 1600 too high
# Set the power level to 400 watts
a_microwave_oven$power_level_watts <- 400
Chapter 4 - R6 Inheritance
Inheritance is an attempt to avoid “copy and paste” from one class to another (dependent, fancier, or the like) class:
Extend or Override to create additional functionality:
Multiple Levels of Inheritance - a can inherit from b that inherited from c and the like:
Example code includes:
# Consolidated MicrowaveOven class: private state, door methods, and a
# power-level binding that clamps writes into [0, rating] instead of warning.
microwave_oven_factory <-
R6::R6Class("MicrowaveOven",
private=list(..power_rating_watts=800,
..power_level_watts=800,
..door_is_open=FALSE
),
public=list(cook=function(time) Sys.sleep(time),
open_door=function() private$..door_is_open <- TRUE,
close_door = function() private$..door_is_open <- FALSE
),
active=list(power_rating_watts=function() private$..power_rating_watts,
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
# Clamp the written value into the valid range [0, rating]
private$..power_level_watts <-
max(0,
min(private$..power_rating_watts,
as.numeric(value)
)
)
}
}
)
)
# Explore the microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a fancy microwave class inheriting from microwave oven
# Child class: `inherit` gives it every member of microwave_oven_factory.
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit=microwave_oven_factory
)
# Explore microwave oven classes
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Public:
## clone: function
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Instantiate both types of microwave
a_microwave_oven <- microwave_oven_factory$new()
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Get power rating for each microwave
microwave_power_rating <- a_microwave_oven$power_level_watts
fancy_microwave_power_rating <- a_fancy_microwave$power_level_watts
# Verify that these are the same
identical(microwave_power_rating, fancy_microwave_power_rating)
## [1] TRUE
# Cook with each microwave
a_microwave_oven$cook(1)
a_fancy_microwave$cook(1)
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Extend the class definition
# Extend the child class with an extra convenience method.
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
# Add a public list with a cook baked potato method
public = list(
# self$ reaches the (inherited) cook() method on this object
cook_baked_potato=function() {
self$cook(3)
}
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook_baked_potato() method
a_fancy_microwave$cook_baked_potato()
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Update the class definition
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
# Add a public list with a cook method
public = list(
# Override cook(): run the parent implementation first, then emit a
# friendly message.
cook = function(time_seconds) {
# super$ dispatches to the parent class's (MicrowaveOven) method
super$cook(time_seconds)
message("Enjoy your dinner!")
}
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook() method
a_fancy_microwave$cook(1)
## Enjoy your dinner!
# Expose the parent functionality
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
public = list(
# Convenience wrapper: cook for a fixed time of 3 via this class's cook()
cook_baked_potato = function() {
self$cook(3)
},
# Override cook(): delegate to the parent implementation, then emit a
# friendly message.
cook = function(time_seconds) {
super$cook(time_seconds)
message("Enjoy your dinner!")
}
),
# Add an active element with a super_ binding
# This read-only binding exposes this class's `super` object, so that a
# *child* class can reach its grandparent's methods via super$super_.
active = list(
super_ = function() super
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the super_ binding
a_fancy_microwave$super_
## <environment: 0x000000000a54dd18>
ascii_pizza_slice <- " __\n // \"\"--.._\n|| (_) _ \"-._\n|| _ (_) '-.\n|| (_) __..-'\n \\\\__..--\"\""
# Explore other microwaves
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Public:
## cook_baked_potato: function
## cook: function
## clone: function
## Active bindings:
## super_: function
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a high-end microwave oven class
high_end_microwave_oven_factory <- R6::R6Class(
"HighEndMicrowaveOven",
inherit=fancy_microwave_oven_factory,
public=list(
# Override cook(): skip FancyMicrowaveOven's override and call the
# grandparent (MicrowaveOven) implementation directly, then show art.
cook=function(time_seconds) {
# super$super_ evaluates FancyMicrowaveOven's `super_` active binding,
# which returns ITS super object -- i.e. the MicrowaveOven methods.
super$super_$cook(time_seconds)
# ascii_pizza_slice is a character constant defined earlier in the script
message(ascii_pizza_slice)
}
)
)
# Instantiate a high-end microwave oven
a_high_end_microwave <- high_end_microwave_oven_factory$new()
# Use it to cook for one second
a_high_end_microwave$cook(1)
## __
## // ""--.._
## || (_) _ "-._
## || _ (_) '-.
## || (_) __..-'
## \\__..--""
Chapter 5 - Advanced R6 Usage
Environments, Reference Behavior, and Static Fields:
Cloning Objects - R6 is built on environments, so copy-by-reference behavior is part and parcel of R6; use clone() (or clone(deep = TRUE)) when an independent copy is needed:
Shut it Down - if the R6 object is linked to a database connection or has other side effects, define a finalize() method so it is cleaned up when the object is garbage collected:
Example code includes:
# Define a new environment
env <- new.env()
# Add an element named perfect
env$perfect <- c(6, 28, 496)
# Add an element named bases ([[<- is equivalent to $<- for environments)
env[["bases"]] <- c("A", "C", "G", "T")
# Assign lst and env: same contents, but a list has copy-on-modify
# semantics while an environment has reference semantics
lst <- list(
perfect = c(6, 28, 496),
bases = c("A", "C", "G", "T")
)
env <- list2env(lst)
# Copy lst
lst2 <- lst
# Change lst's bases element
lst$bases <- c("A", "C", "G", "U")
# Test lst and lst2 identical
# FALSE: modifying lst triggered a copy, so lst2 still has the old value
identical(lst$bases, lst2$bases)
## [1] FALSE
# Copy env
env2 <- env
# Change env's bases element
env$bases <- c("A", "C", "G", "U")
# Test env and env2 identical
# TRUE: env2 is a reference to the SAME environment, so it sees the change
identical(env$bases, env2$bases)
## [1] TRUE
# Complete the class definition
env_microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
# `shared` is initialized ONCE, when the class is defined, so every
# instance's private$shared points at the SAME environment. This is
# how R6 emulates a static (class-level) field.
shared = {
# Create a new environment named e
e <- new.env()
# Assign safety_warning into e
e$safety_warning <- "Warning. Do not try to cook metal objects."
# Return e
e
}
),
active = list(
# Add the safety_warning binding
# Read/write accessor backed by the shared environment: a write made
# through one instance is visible when reading from any other instance.
safety_warning = function(value) {
if (missing(value)) {
private$shared$safety_warning
} else {
private$shared$safety_warning <- value
}
}
)
)
# Create two microwave ovens
a_microwave_oven <- env_microwave_oven_factory$new()
another_microwave_oven <- env_microwave_oven_factory$new()
# Change the safety warning for a_microwave_oven
a_microwave_oven$safety_warning <- "Warning. If the food is too hot you may scald yourself."
# Verify that the warning has changed for another_microwave_oven
another_microwave_oven$safety_warning
## [1] "Warning. If the food is too hot you may scald yourself."
# Still uses microwave_oven_factory as defined in Chapter 4
# Create a microwave oven
a_microwave_oven <- microwave_oven_factory$new()
# Copy a_microwave_oven using <- (copies the REFERENCE: both names now
# point at the same underlying R6 object)
assigned_microwave_oven <- a_microwave_oven
# Copy a_microwave_oven using clone() (makes an independent copy)
cloned_microwave_oven <- a_microwave_oven$clone()
# Change a_microwave_oven's power level
a_microwave_oven$power_level_watts <- 400
# Check a_microwave_oven & assigned_microwave_oven same
# TRUE: the assigned copy shares state with the original
identical(a_microwave_oven$power_level_watts, assigned_microwave_oven$power_level_watts)
## [1] TRUE
# Check a_microwave_oven & cloned_microwave_oven different
# TRUE: the clone kept the original power level, unaffected by the change
!identical(a_microwave_oven$power_level_watts, cloned_microwave_oven$power_level_watts)
## [1] TRUE
# Commented out, because power_plug was never defined in this document
# Create a microwave oven
# a_microwave_oven <- microwave_oven_factory$new()
# Look at its power plug
# a_microwave_oven$power_plug
# Copy a_microwave_oven using clone(), no args
# cloned_microwave_oven <- a_microwave_oven$clone()
# Copy a_microwave_oven using clone(), deep = TRUE
# deep_cloned_microwave_oven <- a_microwave_oven$clone(deep=TRUE)
# Change a_microwave_oven's power plug type
# a_microwave_oven$power_plug$type <- "British"
# Check a_microwave_oven & cloned_microwave_oven same
# identical(a_microwave_oven$power_plug$type, cloned_microwave_oven$power_plug$type)
# Check a_microwave_oven & deep_cloned_microwave_oven different
# !identical(a_microwave_oven$power_plug$type, deep_cloned_microwave_oven$power_plug$type)
# Commented out, because the SQLite database used in the course is not available here
# Microwave_factory is pre-defined
# microwave_oven_factory
# Complete the class definition
# smart_microwave_oven_factory <- R6::R6Class(
# "SmartMicrowaveOven",
# inherit = microwave_oven_factory, # Specify inheritance
# private = list(
# conn = NULL
# ),
# public = list(
# initialize = function() {
# # Connect to the database
# private$conn = dbConnect(SQLite(), "cooking-times.sqlite")
# },
# get_cooking_time = function(food) {
# dbGetQuery(
# private$conn,
# sprintf("SELECT time_seconds FROM cooking_times WHERE food = '%s'", food)
# )
# },
# finalize = function() {
# message("Disconnecting from the cooking times database.")
# dbDisconnect(private$conn)
# }
# )
# )
# Create a smart microwave object
# a_smart_microwave <- smart_microwave_oven_factory$new()
# Call the get_cooking_time() method
# a_smart_microwave$get_cooking_time("soup")
# Remove the smart microwave
# rm(a_smart_microwave)
# Force garbage collection
# gc()
A nice introduction to S3 and R6.
The library(dplyr) is a grammar of data manipulation. It is written in C++, so you get the speed of compiled code with the convenience of R. It is in essence the data-frame-to-data-frame portion of plyr (plyr was the original Split-Apply-Combine). May want to look into count, transmute, and other verbs added after this summary was written.
The examples use data(hflights) from library(hflights):
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:purrr':
##
## contains, order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
data(hflights)
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
The “tbl” is a special type of data frame, which is very helpful for printing:
An interesting way to do a lookup table:
See for example:
# Lookup table: airline carrier code -> carrier name. Element order and
# values match the original definition exactly; names may be given unquoted
# inside c() since they are syntactic.
lut <- c(
  AA = "American",           AS = "Alaska",         B6 = "JetBlue",
  CO = "Continental",        DL = "Delta",          OO = "SkyWest",
  UA = "United",             US = "US_Airways",     WN = "Southwest",
  EV = "Atlantic_Southeast", F9 = "Frontier",       FL = "AirTran",
  MQ = "American_Eagle",     XE = "ExpressJet",     YV = "Mesa"
)
hflights$Carrier <- lut[hflights$UniqueCarrier]
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier <chr> "American", "American", "American", "America...
There are five main verbs in dplyr:
There is also the group_by capability for summaries of sub-groups:
The dplyr library can also work with databases. It only loads the data that you need, and you do not need to know the relevant SQL code – dplyr writes the SQL code for you.
Basic select and mutate examples include:
data(hflights)
# Make it faster, as well as a prettier printer
hflights <- tbl_df(hflights)
hflights
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## * <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
# Select examples
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## # A tibble: 227,496 × 4
## ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int>
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## # ... with 227,486 more rows
select(hflights, Origin:Cancelled)
## # A tibble: 227,496 × 6
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## * <chr> <chr> <int> <int> <int> <int>
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## # ... with 227,486 more rows
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## # A tibble: 227,496 × 14
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## * <int> <int> <int> <int> <int> <int> <chr> <chr>
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## # ... with 227,486 more rows, and 6 more variables: Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
select(hflights, ends_with("Delay"))
## # A tibble: 227,496 × 2
## ArrDelay DepDelay
## * <int> <int>
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## # ... with 227,486 more rows
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## # A tibble: 227,496 × 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## * <chr> <int> <chr> <int> <chr>
## 1 AA 428 N576AA 0
## 2 AA 428 N557AA 0
## 3 AA 428 N541AA 0
## 4 AA 428 N403AA 0
## 5 AA 428 N492AA 0
## 6 AA 428 N262AA 0
## 7 AA 428 N493AA 0
## 8 AA 428 N477AA 0
## 9 AA 428 N476AA 0
## 10 AA 428 N504AA 0
## # ... with 227,486 more rows
select(hflights, ends_with("Time"), ends_with("Delay"))
## # A tibble: 227,496 × 6
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int> <int> <int>
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## # ... with 227,486 more rows
# Mutate example
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)
class(m1)
## [1] "tbl_df" "tbl" "data.frame"
m1
## # A tibble: 227,496 × 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, loss <int>, loss_ratio <dbl>
glimpse(m1)
## Observations: 227,496
## Variables: 23
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ loss <int> -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_ratio <dbl> -Inf, -10.00000000, 0.00000000, 0.00000000, ...
Additionally, examples for filter and arrange:
# Examples for filter
filter(hflights, Distance >= 3000) # All flights that traveled 3000 miles or more
## # A tibble: 527 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 31 1 924 1413 CO
## 2 2011 1 30 7 925 1410 CO
## 3 2011 1 29 6 1045 1445 CO
## 4 2011 1 28 5 1516 1916 CO
## 5 2011 1 27 4 950 1344 CO
## 6 2011 1 26 3 944 1350 CO
## 7 2011 1 25 2 924 1337 CO
## 8 2011 1 24 1 1144 1605 CO
## 9 2011 1 23 7 926 1335 CO
## 10 2011 1 22 6 942 1340 CO
## # ... with 517 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, UniqueCarrier %in% c("B6", "WN", "DL"))
## # A tibble: 48,679 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 654 1124 B6
## 2 2011 1 1 6 1639 2110 B6
## 3 2011 1 2 7 703 1113 B6
## 4 2011 1 2 7 1604 2040 B6
## 5 2011 1 3 1 659 1100 B6
## 6 2011 1 3 1 1801 2200 B6
## 7 2011 1 4 2 654 1103 B6
## 8 2011 1 4 2 1608 2034 B6
## 9 2011 1 5 3 700 1103 B6
## 10 2011 1 5 3 1544 1954 B6
## # ... with 48,669 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, (TaxiIn + TaxiOut) > AirTime) # Flights where taxiing took longer than flying
## # A tibble: 1,389 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 24 1 731 904 AA
## 2 2011 1 30 7 1959 2132 AA
## 3 2011 1 24 1 1621 1749 AA
## 4 2011 1 10 1 941 1113 AA
## 5 2011 1 31 1 1301 1356 CO
## 6 2011 1 31 1 2113 2215 CO
## 7 2011 1 31 1 1434 1539 CO
## 8 2011 1 31 1 900 1006 CO
## 9 2011 1 30 7 1304 1408 CO
## 10 2011 1 30 7 2004 2128 CO
## # ... with 1,379 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, DepTime < 500 | ArrTime > 2200) # Flights departed before 5am or arrived after 10pm
## # A tibble: 27,799 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 4 2 2100 2207 AA
## 2 2011 1 14 5 2119 2229 AA
## 3 2011 1 10 1 1934 2235 AA
## 4 2011 1 26 3 1905 2211 AA
## 5 2011 1 30 7 1856 2209 AA
## 6 2011 1 9 7 1938 2228 AS
## 7 2011 1 31 1 1919 2231 CO
## 8 2011 1 31 1 2116 2344 CO
## 9 2011 1 31 1 1850 2211 CO
## 10 2011 1 31 1 2102 2216 CO
## # ... with 27,789 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, DepDelay > 0, ArrDelay < 0) # Flights that departed late but arrived ahead of schedule
## # A tibble: 27,712 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 2 7 1401 1501 AA
## 2 2011 1 5 3 1405 1507 AA
## 3 2011 1 18 2 1408 1508 AA
## 4 2011 1 18 2 721 827 AA
## 5 2011 1 12 3 2015 2113 AA
## 6 2011 1 13 4 2020 2116 AA
## 7 2011 1 26 3 2009 2103 AA
## 8 2011 1 1 6 1631 1736 AA
## 9 2011 1 10 1 1639 1740 AA
## 10 2011 1 12 3 1631 1739 AA
## # ... with 27,702 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, Cancelled == 1, DepDelay > 0) # Flights that were cancelled after being delayed
## # A tibble: 40 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 26 3 1926 NA CO
## 2 2011 1 11 2 1100 NA US
## 3 2011 1 19 3 1811 NA XE
## 4 2011 1 7 5 2028 NA XE
## 5 2011 2 4 5 1638 NA AA
## 6 2011 2 8 2 1057 NA CO
## 7 2011 2 2 3 802 NA XE
## 8 2011 2 9 3 904 NA XE
## 9 2011 2 1 2 1508 NA OO
## 10 2011 3 31 4 1016 NA CO
## # ... with 30 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
c1 <- filter(hflights, Dest == "JFK") # Flights that had JFK as their destination: c1
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep="-")) # Create a Date column: c2
select(c2, Date, DepTime, ArrTime, TailNum) # Print out a selection of columns of c2
## # A tibble: 695 × 4
## Date DepTime ArrTime TailNum
## <chr> <int> <int> <chr>
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## # ... with 685 more rows
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay)) # Definition of dtc
# Examples for arrange
arrange(dtc, DepDelay) # Arrange dtc by departure delays
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 23 6 605 NA F9
## 2 2011 1 17 1 916 NA XE
## 3 2011 12 1 4 541 NA US
## 4 2011 10 12 3 2022 NA MQ
## 5 2011 7 29 5 1424 NA CO
## 6 2011 9 29 4 1639 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 5 9 1 715 NA OO
## 9 2011 1 20 4 1413 NA UA
## 10 2011 1 17 1 831 NA WN
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(dtc, CancellationCode) # Arrange dtc so that cancellation reasons are grouped
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 20 4 1413 NA UA
## 2 2011 1 7 5 2028 NA XE
## 3 2011 2 4 5 1638 NA AA
## 4 2011 2 8 2 1057 NA CO
## 5 2011 2 1 2 1508 NA OO
## 6 2011 2 21 1 2257 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 3 18 5 727 NA UA
## 9 2011 4 4 1 1632 NA DL
## 10 2011 4 8 5 1608 NA WN
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(dtc, UniqueCarrier, DepDelay) # Arrange dtc according to carrier and departure delays
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 8 18 4 1808 NA AA
## 2 2011 2 4 5 1638 NA AA
## 3 2011 7 29 5 1424 NA CO
## 4 2011 1 26 3 1703 NA CO
## 5 2011 8 11 4 1320 NA CO
## 6 2011 7 25 1 1654 NA CO
## 7 2011 1 26 3 1926 NA CO
## 8 2011 3 31 4 1016 NA CO
## 9 2011 2 8 2 1057 NA CO
## 10 2011 4 4 1 1632 NA DL
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(hflights, UniqueCarrier, desc(DepDelay)) # Arrange by carrier and decreasing departure delays
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 12 12 1 650 808 AA
## 2 2011 11 19 6 1752 1910 AA
## 3 2011 12 22 4 1728 1848 AA
## 4 2011 10 23 7 2305 2 AA
## 5 2011 9 27 2 1206 1300 AA
## 6 2011 3 17 4 1647 1747 AA
## 7 2011 6 21 2 955 1315 AA
## 8 2011 5 20 5 2359 130 AA
## 9 2011 4 19 2 2023 2142 AA
## 10 2011 5 12 4 2133 53 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(hflights, DepDelay + ArrDelay) # Arrange flights by total delay (normal order)
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 3 7 1914 2039 XE
## 2 2011 8 31 3 934 1039 OO
## 3 2011 8 21 7 935 1039 OO
## 4 2011 8 28 7 2059 2206 OO
## 5 2011 8 29 1 935 1041 OO
## 6 2011 12 25 7 741 926 OO
## 7 2011 1 30 7 620 812 OO
## 8 2011 8 3 3 1741 1810 XE
## 9 2011 8 4 4 930 1041 OO
## 10 2011 8 18 4 939 1043 OO
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
Additionally, examples for the summarize verb:
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 2
## min_dist max_dist
## <int> <int>
## 1 79 3904
# Print out a summary with variable max_div
summarize(filter(hflights, Diverted == 1), max_div = max(Distance))
## # A tibble: 1 × 1
## max_div
## <int>
## 1 3904
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarize(temp1, earliest=min(ArrDelay), average=mean(ArrDelay), latest=max(ArrDelay), sd=sd(ArrDelay))
## # A tibble: 1 × 4
## earliest average latest sd
## <int> <dbl> <int> <dbl>
## 1 -70 7.094334 978 30.70852
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarise()
summarize(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## # A tibble: 1 × 1
## max_taxi_diff
## <int>
## 1 160
# Generate summarizing statistics for hflights
summarize(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest))
## # A tibble: 1 × 3
## n_obs n_carrier n_dest
## <int> <int> <int>
## 1 227496 15 116
# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "AA")
# Generate summarizing statistics for aa
summarize(aa, n_flights = n(), n_canc = sum(Cancelled), avg_delay = mean(ArrDelay, na.rm=TRUE))
## # A tibble: 1 × 3
## n_flights n_canc avg_delay
## <int> <int> <dbl>
## 1 3244 60 0.8917558
Additionally, examples for the pipe/chain as per magrittr:
# Find the average delta in taxi times
hflights %>%
mutate(diff = (TaxiOut - TaxiIn)) %>%
filter(!is.na(diff)) %>%
summarize(avg = mean(diff))
## # A tibble: 1 × 1
## avg
## <dbl>
## 1 8.992064
# Find flights that average less than 70 mph assuming 100 wasted minutes per flight
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = 60 * Distance / RealTime) %>%
filter(!is.na(mph), mph < 70) %>%
summarize(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 4
## n_less n_dest min_dist max_dist
## <int> <int> <int> <int>
## 1 6726 13 79 305
# Find flights that average less than 105 mph, or that are diverted/cancelled
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarize(n_non = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 4
## n_non n_dest min_dist max_dist
## <int> <int> <int> <int>
## 1 42400 113 79 3904
# Find overnight flights
filter(hflights, !is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarize(num = n())
## # A tibble: 1 × 1
## num
## <int>
## 1 2718
There is also the group_by() capability, typically used together with summarize():
# Make an ordered per-carrier summary of hflights
group_by(hflights, UniqueCarrier) %>%
summarize(p_canc = 100 * mean(Cancelled, na.rm=TRUE), avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
arrange(avg_delay, p_canc)
## # A tibble: 15 × 3
## UniqueCarrier p_canc avg_delay
## <chr> <dbl> <dbl>
## 1 US 1.1268986 -0.6307692
## 2 AA 1.8495684 0.8917558
## 3 FL 0.9817672 1.8536239
## 4 AS 0.0000000 3.1923077
## 5 YV 1.2658228 4.0128205
## 6 DL 1.5903067 6.0841374
## 7 CO 0.6782614 6.0986983
## 8 MQ 2.9044750 7.1529751
## 9 EV 3.4482759 7.2569543
## 10 WN 1.5504047 7.5871430
## 11 F9 0.7159905 7.6682692
## 12 XE 1.5495599 8.1865242
## 13 OO 1.3946828 8.6934922
## 14 B6 2.5899281 9.8588410
## 15 UA 1.6409266 10.4628628
# Ordered overview of average arrival delays per carrier
hflights %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
group_by(UniqueCarrier) %>%
summarize(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## # A tibble: 15 × 3
## UniqueCarrier avg rank
## <chr> <dbl> <dbl>
## 1 YV 18.67568 1
## 2 F9 18.68683 2
## 3 US 20.70235 3
## 4 CO 22.13374 4
## 5 AS 22.91195 5
## 6 OO 24.14663 6
## 7 XE 24.19337 7
## 8 WN 25.27750 8
## 9 FL 27.85693 9
## 10 AA 28.49740 10
## 11 DL 32.12463 11
## 12 UA 32.48067 12
## 13 MQ 38.75135 13
## 14 EV 40.24231 14
## 15 B6 45.47744 15
# How many airplanes only flew to one destination?
hflights %>%
group_by(TailNum) %>%
summarise(destPerTail = n_distinct(Dest)) %>%
filter(destPerTail == 1) %>%
summarise(nplanes=n())
## # A tibble: 1 × 1
## nplanes
## <int>
## 1 1526
# Find the most visited destination for each carrier
hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(-n)) %>%
filter(rank == 1)
## Source: local data frame [15 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Dest n rank
## <chr> <chr> <int> <dbl>
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1
## 7 F9 DEN 837 1
## 8 FL ATL 2029 1
## 9 MQ DFW 2424 1
## 10 OO COS 1335 1
## 11 UA SFO 643 1
## 12 US CLT 2212 1
## 13 WN DAL 8243 1
## 14 XE CRP 3175 1
## 15 YV CLT 71 1
# Use summarise to calculate n_carrier
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
## The following object is masked from 'package:purrr':
##
## transpose
hflights2 <- as.data.table(hflights)
hflights2 %>%
summarize(n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1 15
And, dplyr can be used with databases, including generating the SQL query that corresponds to the dplyr request. The results are cached to avoid repeatedly querying the server:
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
## Observations: 336,776
## Variables: 17
## $ id (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay (int) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay (int) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time (int) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance (int) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour (int) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute (int) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights
nycflights %>%
group_by(carrier) %>%
summarize(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
## Source: mysql 5.6.23-log [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## From: <derived table> [?? x 3]
## Arrange: avg_delay
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## carrier n_flights avg_delay
## (chr) (dbl) (dbl)
## 1 AS 714 -9.8613
## 2 HA 342 -6.9152
## 3 AA 32729 0.3556
## 4 DL 48110 1.6289
## 5 VX 5162 1.7487
## 6 US 20536 2.0565
## 7 UA 58665 3.5045
## 8 9E 18460 6.9135
## 9 B6 54635 9.3565
## 10 WN 12275 9.4675
## .. ... ... ...
Overall Course Overview - the goal is to have data in a single, tidy table
However, real-world data is typically split across multiple tables; this course will be about handling that:
Builds on the above course about basic dplyr. More than one way to handle things, as is common in R; base::merge() provides some similar functionality, however:
Chapter 1 - Mutating Joins
Keys are the columns that are “matched” between datasets that are being joined:
Joins can be run in several manners:
Variations on joins - the left_join and right_join are “mutating joins”, which is to say that they return a copy of the “primary” data with columns added as appropriate:
Example code includes:
artFirst <- "Jimmy ; George ; Mick ; Tom ; Davy ; John ; Paul ; Jimmy ; Joe ; Elvis ; Keith ; Paul ; Ringo ; Joe ; Brian ; Nancy"
artLast <- "Buffett ; Harrison ; Jagger ; Jones ; Jones ; Lennon ; McCartney ; Page ; Perry ; Presley ; Richards ; Simon ; Starr ; Walsh ; Wilson ; Wilson"
artInstrument <- "Guitar ; Guitar ; Vocals ; Vocals ; Vocals ; Guitar ; Bass ; Guitar ; Guitar ; Vocals ; Guitar ; Guitar ; Drums ; Guitar ; Vocals ; Vocals"
bandFirst <- "John ; John Paul ; Jimmy ; Robert ; George ; John ; Paul ; Ringo ; Jimmy ; Mick ; Keith ; Charlie ; Ronnie"
bandLast <- "Bonham ; Jones ; Page ; Plant ; Harrison ; Lennon ; McCartney ; Starr ; Buffett ; Jagger ; Richards ; Watts ; Woods"
bandBand <- "Led Zeppelin ; Led Zeppelin ; Led Zeppelin ; Led Zeppelin ; The Beatles ; The Beatles ; The Beatles ; The Beatles ; The Coral Reefers ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones"
artists <- data.frame( first=strsplit(artFirst, " ; ")[[1]] ,
last=strsplit(artLast, " ; ")[[1]] ,
instrument=strsplit(artInstrument, " ; ")[[1]] ,
stringsAsFactors=FALSE
)
bands <- data.frame( first=strsplit(bandFirst, " ; ")[[1]] ,
last=strsplit(bandLast, " ; ")[[1]] ,
band=strsplit(bandBand, " ; ")[[1]] ,
stringsAsFactors=FALSE
)
library(dplyr)
# Complete the code to join artists to bands
bands2 <- left_join(bands, artists, by = c("first", "last"))
# Examine the results
bands2
## first last band instrument
## 1 John Bonham Led Zeppelin <NA>
## 2 John Paul Jones Led Zeppelin <NA>
## 3 Jimmy Page Led Zeppelin Guitar
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Mick Jagger The Rolling Stones Vocals
## 11 Keith Richards The Rolling Stones Guitar
## 12 Charlie Watts The Rolling Stones <NA>
## 13 Ronnie Woods The Rolling Stones <NA>
# Note how this would be WRONG even though the code executes fine
left_join(bands, artists, by = c("first"))
## first last.x band last.y instrument
## 1 John Bonham Led Zeppelin Lennon Guitar
## 2 John Paul Jones Led Zeppelin <NA> <NA>
## 3 Jimmy Page Led Zeppelin Buffett Guitar
## 4 Jimmy Page Led Zeppelin Page Guitar
## 5 Robert Plant Led Zeppelin <NA> <NA>
## 6 George Harrison The Beatles Harrison Guitar
## 7 John Lennon The Beatles Lennon Guitar
## 8 Paul McCartney The Beatles McCartney Bass
## 9 Paul McCartney The Beatles Simon Guitar
## 10 Ringo Starr The Beatles Starr Drums
## 11 Jimmy Buffett The Coral Reefers Buffett Guitar
## 12 Jimmy Buffett The Coral Reefers Page Guitar
## 13 Mick Jagger The Rolling Stones Jagger Vocals
## 14 Keith Richards The Rolling Stones Richards Guitar
## 15 Charlie Watts The Rolling Stones <NA> <NA>
## 16 Ronnie Woods The Rolling Stones <NA> <NA>
# Finish the code below to recreate bands3 with a right join
bands2 <- left_join(bands, artists, by = c("first", "last"))
bands3 <- right_join(artists, bands, by = c("first", "last"))
# Check that bands3 is equal to bands2
setequal(bands2, bands3)
## TRUE
songData <- "Come Together : Abbey Road : John : Lennon ; Dream On : Aerosmith : Steven : Tyler ; Hello, Goodbye : Magical Mystery Tour : Paul : McCartney ; It's Not Unusual : Along Came Jones : Tom : Jones"
albumsData <- "A Hard Day's Night : The Beatles : 1964 ; Magical Mystery Tour : The Beatles : 1967 ; Beggar's Banquet : The Rolling Stones : 1968 ; Abbey Road : The Beatles : 1969 ; Led Zeppelin IV : Led Zeppelin : 1971 ; The Dark Side of the Moon : Pink Floyd : 1973 ; Aerosmith : Aerosmith : 1973 ; Rumours : Fleetwood Mac : 1977 ; Hotel California : Eagles : 1982"
songs <- as.data.frame( t(sapply(strsplit(songData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
)
) , stringsAsFactors=FALSE
)
albums <- as.data.frame( t(sapply(strsplit(albumsData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
))
, stringsAsFactors=FALSE
)
names(songs) <- c("song", "album", "first", "last")
names(albums) <- c("album", "band", "year")
# Join albums to songs using inner_join()
inner_join(songs, albums, by="album")
## song album first last band year
## 1 Come Together Abbey Road John Lennon The Beatles 1969
## 2 Dream On Aerosmith Steven Tyler Aerosmith 1973
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney The Beatles 1967
# Join bands to artists using full_join()
full_join(artists, bands, by=c("first", "last"))
## first last instrument band
## 1 Jimmy Buffett Guitar The Coral Reefers
## 2 George Harrison Guitar The Beatles
## 3 Mick Jagger Vocals The Rolling Stones
## 4 Tom Jones Vocals <NA>
## 5 Davy Jones Vocals <NA>
## 6 John Lennon Guitar The Beatles
## 7 Paul McCartney Bass The Beatles
## 8 Jimmy Page Guitar Led Zeppelin
## 9 Joe Perry Guitar <NA>
## 10 Elvis Presley Vocals <NA>
## 11 Keith Richards Guitar The Rolling Stones
## 12 Paul Simon Guitar <NA>
## 13 Ringo Starr Drums The Beatles
## 14 Joe Walsh Guitar <NA>
## 15 Brian Wilson Vocals <NA>
## 16 Nancy Wilson Vocals <NA>
## 17 John Bonham <NA> Led Zeppelin
## 18 John Paul Jones <NA> Led Zeppelin
## 19 Robert Plant <NA> Led Zeppelin
## 20 Charlie Watts <NA> The Rolling Stones
## 21 Ronnie Woods <NA> The Rolling Stones
# Find guitarists in bands dataset (don't change)
temp <- left_join(bands, artists, by = c("first", "last"))
temp <- filter(temp, instrument == "Guitar")
select(temp, first, last, band)
## first last band
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
# Reproduce code above using pipes
bands %>%
left_join(artists, by = c("first", "last")) %>%
filter(instrument == "Guitar") %>%
select(first, last, band)
## first last band
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
goalData <- "Tom : John : Paul ; Jones : Lennon : McCartney ; Vocals : Guitar : Bass ; NA : The Beatles : The Beatles ; It's Not Unusual : Come Together : Hello, Goodbye ; Along Came Jones : Abbey Road : Magical Mystery Tour"
goal <- as.data.frame( sapply(strsplit(goalData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
) , stringsAsFactors=FALSE
)
names(goal) <- c("first", "last", "instrument", "band", "song", "album")
goal[goal == "NA"] <- NA # Fix the text that is "NA"
# Examine the contents of the goal dataset
goal
## first last instrument band song
## 1 Tom Jones Vocals <NA> It's Not Unusual
## 2 John Lennon Guitar The Beatles Come Together
## 3 Paul McCartney Bass The Beatles Hello, Goodbye
## album
## 1 Along Came Jones
## 2 Abbey Road
## 3 Magical Mystery Tour
# Create goal2 using full_join() and inner_join()
goal2 <- artists %>%
full_join(bands, by=c("first", "last")) %>%
inner_join(songs, by=c("first", "last"))
# Check that goal and goal2 are the same
setequal(goal, goal2)
## TRUE
sum(goal != goal2, na.rm=TRUE)
## [1] 0
# Create one table that combines all information
artists %>%
full_join(bands, by=c("first", "last")) %>%
full_join(songs, by=c("first", "last")) %>%
full_join(albums, by=c("album", "band"))
## first last instrument band song
## 1 Jimmy Buffett Guitar The Coral Reefers <NA>
## 2 George Harrison Guitar The Beatles <NA>
## 3 Mick Jagger Vocals The Rolling Stones <NA>
## 4 Tom Jones Vocals <NA> It's Not Unusual
## 5 Davy Jones Vocals <NA> <NA>
## 6 John Lennon Guitar The Beatles Come Together
## 7 Paul McCartney Bass The Beatles Hello, Goodbye
## 8 Jimmy Page Guitar Led Zeppelin <NA>
## 9 Joe Perry Guitar <NA> <NA>
## 10 Elvis Presley Vocals <NA> <NA>
## 11 Keith Richards Guitar The Rolling Stones <NA>
## 12 Paul Simon Guitar <NA> <NA>
## 13 Ringo Starr Drums The Beatles <NA>
## 14 Joe Walsh Guitar <NA> <NA>
## 15 Brian Wilson Vocals <NA> <NA>
## 16 Nancy Wilson Vocals <NA> <NA>
## 17 John Bonham <NA> Led Zeppelin <NA>
## 18 John Paul Jones <NA> Led Zeppelin <NA>
## 19 Robert Plant <NA> Led Zeppelin <NA>
## 20 Charlie Watts <NA> The Rolling Stones <NA>
## 21 Ronnie Woods <NA> The Rolling Stones <NA>
## 22 Steven Tyler <NA> <NA> Dream On
## 23 <NA> <NA> <NA> The Beatles <NA>
## 24 <NA> <NA> <NA> The Rolling Stones <NA>
## 25 <NA> <NA> <NA> Led Zeppelin <NA>
## 26 <NA> <NA> <NA> Pink Floyd <NA>
## 27 <NA> <NA> <NA> Aerosmith <NA>
## 28 <NA> <NA> <NA> Fleetwood Mac <NA>
## 29 <NA> <NA> <NA> Eagles <NA>
## album year
## 1 <NA> <NA>
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 Along Came Jones <NA>
## 5 <NA> <NA>
## 6 Abbey Road 1969
## 7 Magical Mystery Tour 1967
## 8 <NA> <NA>
## 9 <NA> <NA>
## 10 <NA> <NA>
## 11 <NA> <NA>
## 12 <NA> <NA>
## 13 <NA> <NA>
## 14 <NA> <NA>
## 15 <NA> <NA>
## 16 <NA> <NA>
## 17 <NA> <NA>
## 18 <NA> <NA>
## 19 <NA> <NA>
## 20 <NA> <NA>
## 21 <NA> <NA>
## 22 Aerosmith <NA>
## 23 A Hard Day's Night 1964
## 24 Beggar's Banquet 1968
## 25 Led Zeppelin IV 1971
## 26 The Dark Side of the Moon 1973
## 27 Aerosmith 1973
## 28 Rumours 1977
## 29 Hotel California 1982
Chapter 2 - Filtering Joins and Set Operations
Filtering joins return a copy of the primary data frame that has been filtered rather than augmented:
The anti_join() is the opposite of the semi_join() in that it keeps only rows that DO NOT have a match:
Set operations are used when two datasets contain the exact same variables:
Comparing datasets can also be run using setequal():
Example code includes:
# Data sets still available from the previous module
# View the output of semi_join()
artists %>%
semi_join(songs, by = c("first", "last"))
## first last instrument
## 1 John Lennon Guitar
## 2 Paul McCartney Bass
## 3 Tom Jones Vocals
# Create the same result
artists %>%
right_join(songs, by = c("first", "last")) %>%
filter(!is.na(instrument)) %>%
select(first, last, instrument)
## first last instrument
## 1 John Lennon Guitar
## 2 Paul McCartney Bass
## 3 Tom Jones Vocals
albums %>%
# Collect the albums made by a band
semi_join(bands, by="band") %>%
# Count the albums made by a band
nrow()
## [1] 5
# Create data set tracks and matches
trackTrack <- "Can't Buy Me Love ; I Feel Fine ; A Hard Day's Night ; Sound of Silence ; Help! ; Ticket to Ride ; I am a Rock ; Yellow Submarine / Eleanor Rigby ; Homeward Bound ; Scarborough Fair ; Penny Lane ; Strawberry Fields Forever ; Hello, Goodbye ; Ruby Tuesday ; All You Need Is Love ; Hey Jude ; Lady Madonna ; Get Back ; Sympathy for the Devil ; Brown Sugar ; Happy"
trackBand <- "The Beatles ; The Beatles ; The Beatles ; Simon and Garfunkel ; The Beatles ; The Beatles ; Simon and Garfunkel ; The Beatles ; Simon and Garfunkel ; Simon and Garfunkel ; The Beatles ; The Beatles ; The Beatles ; The Rolling Stones ; The Beatles ; The Beatles ; The Beatles ; The Beatles ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones"
trackLabel <- "Parlophone ; Parlophone ; Parlophone ; Columbia ; Parlophone ; Parlophone ; Columbia ; Parlophone ; Columbia ; Columbia ; Parlophone ; Parlophone ; Parlophone ; Decca ; Parlophone ; Apple ; Parlophone ; Apple ; Decca ; Rolling Stones Records ; Rolling Stones Records"
trackYear <- "1964 ; 1964 ; 1964 ; 1964 ; 1965 ; 1965 ; 1965 ; 1966 ; 1966 ; 1966 ; 1967 ; 1967 ; 1967 ; 1967 ; 1967 ; 1968 ; 1968 ; 1969 ; 1969 ; 1971 ; 1972"
trackFirst <- "Paul ; John ; John ; Paul ; John ; John ; Paul ; Paul ; Paul ; unknown ; Paul ; John ; Paul ; Keith ; John ; Paul ; Paul ; Paul ; Mick ; Mick ; Keith"
trackLast <- "McCartney ; Lennon ; Lennon ; Simon ; Lennon ; Lennon ; Simon ; McCartney ; Simon ; unknown ; McCartney ; Lennon ; McCartney ; Richards ; Lennon ; McCartney ; McCartney ; McCartney ; Jagger ; Jagger ; Richards"
tracks <- data.frame(track=strsplit(trackTrack, " ; ")[[1]],
band=strsplit(trackBand, " ; ")[[1]],
label=strsplit(trackLabel, " ; ")[[1]],
year=as.integer(strsplit(trackYear, " ; ")[[1]]),
first=strsplit(trackFirst, " ; ")[[1]],
last=strsplit(trackLast, " ; ")[[1]],
stringsAsFactors = FALSE
)
matches <- data.frame(band=c("The Beatles", "The Beatles", "Simon and Garfunkel"),
year=c(1964L, 1965L, 1966L),
first=c("Paul", "John", "Paul"),
stringsAsFactors=FALSE
)
# Comparison of effort required
tracks %>% semi_join(
matches,
by = c("band", "year", "first")
)
## track band label year first last
## 1 Can't Buy Me Love The Beatles Parlophone 1964 Paul McCartney
## 2 Help! The Beatles Parlophone 1965 John Lennon
## 3 Ticket to Ride The Beatles Parlophone 1965 John Lennon
## 4 Homeward Bound Simon and Garfunkel Columbia 1966 Paul Simon
tracks %>% filter(
(band == "The Beatles" &
year == 1964 & first == "Paul") |
(band == "The Beatles" &
year == 1965 & first == "John") |
(band == "Simon and Garfunkel" &
year == 1966 & first == "Paul")
)
## track band label year first last
## 1 Can't Buy Me Love The Beatles Parlophone 1964 Paul McCartney
## 2 Help! The Beatles Parlophone 1965 John Lennon
## 3 Ticket to Ride The Beatles Parlophone 1965 John Lennon
## 4 Homeward Bound Simon and Garfunkel Columbia 1966 Paul Simon
# Return rows of artists that don't have bands info
artists %>%
anti_join(bands, by=c("first", "last"))
## first last instrument
## 1 Elvis Presley Vocals
## 2 Brian Wilson Vocals
## 3 Nancy Wilson Vocals
## 4 Tom Jones Vocals
## 5 Davy Jones Vocals
## 6 Paul Simon Guitar
## 7 Joe Walsh Guitar
## 8 Joe Perry Guitar
# Return rows of artists that don't have bands info (note: repeats the previous snippet)
artists %>%
anti_join(bands, by=c("first", "last"))
## first last instrument
## 1 Elvis Presley Vocals
## 2 Brian Wilson Vocals
## 3 Nancy Wilson Vocals
## 4 Tom Jones Vocals
## 5 Davy Jones Vocals
## 6 Paul Simon Guitar
## 7 Joe Walsh Guitar
## 8 Joe Perry Guitar
albumMyLabel <- "Abbey Road ; A Hard Days Night ; Magical Mystery Tour ; Led Zeppelin IV ; The Dark Side of the Moon ; Hotel California ; Rumours ; Aerosmith ; Beggar's Banquet"
labelMyLabel <- "Apple ; Parlophone ; Parlophone ; Atlantic ; Harvest ; Asylum ; Warner Brothers ; Columbia ; Decca"
myLabels <- data.frame(album=strsplit(albumMyLabel, " ; ")[[1]],
label=strsplit(labelMyLabel, " ; ")[[1]],
stringsAsFactors=FALSE
)
# Check whether album names in labels are mis-entered
myLabels %>%
anti_join(albums, by="album")
## album label
## 1 A Hard Days Night Parlophone
# Determine which key joins labels and songs
myLabels
## album label
## 1 Abbey Road Apple
## 2 A Hard Days Night Parlophone
## 3 Magical Mystery Tour Parlophone
## 4 Led Zeppelin IV Atlantic
## 5 The Dark Side of the Moon Harvest
## 6 Hotel California Asylum
## 7 Rumours Warner Brothers
## 8 Aerosmith Columbia
## 9 Beggar's Banquet Decca
songs
## song album first last
## 1 Come Together Abbey Road John Lennon
## 2 Dream On Aerosmith Steven Tyler
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney
## 4 It's Not Unusual Along Came Jones Tom Jones
# Check your understanding
songs %>%
# Find the rows of songs that match a row in labels
semi_join(myLabels, by="album") %>%
# Number of matches between labels and songs
nrow()
## [1] 3
songAerosmith <- "Make It ; Somebody ; Dream On ; One Way Street ; Mama Kin ; Write me a Letter ; Moving Out ; Walking the Dog"
lengthAerosmith <- "13260 ; 13500 ; 16080 ; 25200 ; 15900 ; 15060 ; 18180 ; 11520"
songGreatestHits <- "Dream On ; Mama Kin ; Same Old Song and Dance ; Seasons of Winter ; Sweet Emotion ; Walk this Way ; Big Ten Inch Record ; Last Child ; Back in the Saddle ; Draw the Line ; Kings and Queens ; Come Together ; Remember (Walking in the Sand) ; Lightning Strikes ; Chip Away the Stone ; Sweet Emotion (remix) ; One Way Street (live)"
lengthGreatestHits <- "16080 ; 16020 ; 11040 ; 17820 ; 11700 ; 12780 ; 8100 ; 12480 ; 16860 ; 12240 ; 13680 ; 13620 ; 14700 ; 16080 ; 14460 ; 16560 ; 24000"
songLive <- "Back in the Saddle ; Sweet Emotion ; Lord of the Thighs ; Toys in the Attic ; Last Child ; Come Together ; Walk this Way ; Sick as a Dog ; Dream On ; Chip Away the Stone ; Sight for Sore Eyes ; Mama Kin ; S.O.S. (Too Bad) ; I Ain't Got You ; Mother Popcorn/Draw the Line ; Train Kept A-Rollin'/Strangers in the Night"
lengthLive <- "15900 ; 16920 ; 26280 ; 13500 ; 12240 ; 17460 ; 13560 ; 16920 ; 16260 ; 15120 ; 11880 ; 13380 ; 9960 ; 14220 ; 41700 ; 17460"
aerosmith <- data.frame(song=strsplit(songAerosmith, " ; ")[[1]],
length=as.integer(strsplit(lengthAerosmith, " ; ")[[1]]),
stringsAsFactors=FALSE
)
greatest_hits <- data.frame(song=strsplit(songGreatestHits, " ; ")[[1]],
length=as.integer(strsplit(lengthGreatestHits, " ; ")[[1]]),
stringsAsFactors=FALSE
)
myLive <- data.frame(song=strsplit(songLive, " ; ")[[1]],
length=as.integer(strsplit(lengthLive, " ; ")[[1]]),
stringsAsFactors=FALSE
)
aerosmith %>%
# Create the new dataset using a set operation
union(greatest_hits) %>%
# Count the total number of songs
nrow()
## [1] 24
# Create the new dataset using a set operation
aerosmith %>%
intersect(greatest_hits)
## song length
## 1 Dream On 16080
# Select the song names from live
live_songs <- myLive %>% select(song)
# Select the song names from greatest_hits
greatest_songs <- greatest_hits %>% select(song)
# Create the new dataset using a set operation
live_songs %>%
setdiff(greatest_songs)
## song
## 1 Lord of the Thighs
## 2 Toys in the Attic
## 3 Sick as a Dog
## 4 Sight for Sore Eyes
## 5 S.O.S. (Too Bad)
## 6 I Ain't Got You
## 7 Mother Popcorn/Draw the Line
## 8 Train Kept A-Rollin'/Strangers in the Night
# Select songs from live and greatest_hits
live_songs <- select(myLive, song)
greatest_songs <- select(greatest_hits, song)
# Return the songs that only exist in one dataset
union(setdiff(live_songs, greatest_songs), setdiff(greatest_songs, live_songs))
## song
## 1 Lord of the Thighs
## 2 Toys in the Attic
## 3 Sick as a Dog
## 4 Sight for Sore Eyes
## 5 S.O.S. (Too Bad)
## 6 I Ain't Got You
## 7 Mother Popcorn/Draw the Line
## 8 Train Kept A-Rollin'/Strangers in the Night
## 9 Same Old Song and Dance
## 10 Seasons of Winter
## 11 Big Ten Inch Record
## 12 Draw the Line
## 13 Kings and Queens
## 14 Remember (Walking in the Sand)
## 15 Lightning Strikes
## 16 Sweet Emotion (remix)
## 17 One Way Street (live)
# DO NOT HAVE DATA - NEED TO SKIP
# Check if same order: definitive and complete
# identical(definitive, complete)
# Check if any order: definitive and complete
# setequal(definitive, complete)
# Songs in definitive but not complete
# setdiff(definitive, complete)
# Songs in complete but not definitive
# setdiff(complete, definitive)
# Return songs in definitive that are not in complete
# definitive %>%
# anti_join(complete, by=c("song", "album"))
# Return songs in complete that are not in definitive
# complete %>%
# anti_join(definitive, by=c("song", "album"))
# Check if same order: definitive and union of complete and soundtrack
# identical(definitive, union(complete, soundtrack))
# Check if any order: definitive and union of complete and soundtrack
# setequal(definitive, union(complete, soundtrack))
Chapter 3 - Assembling Data
Binding is the process of either combining columns for datasets that have the same rows, or combining rows for datasets that have the same columns:
Building a better data frame - equivalents for data.frame and as.data.frame:
Working with data types - R typically behaves intuitively:
General coercion rules - more specific types of data will generally be converted to less specific types of data:
Example code includes:
songSideOne <- "Speak to Me ; Breathe ; On the Run ; Time ; The Great Gig in the Sky"
lengthSideOne <- "5400 ; 9780 ; 12600 ; 24780 ; 15300"
songSideTwo <-"Money ; Us and Them ; Any Colour You Like ; Brain Damage ; Eclipse"
lengthSideTwo <-"23400 ; 28260 ; 12240 ; 13800 ; 7380"
side_one <- data.frame(song=strsplit(songSideOne, " ; ")[[1]],
length=as.integer(strsplit(lengthSideOne, " ; ")[[1]]),
stringsAsFactors=FALSE
)
side_two <- data.frame(song=strsplit(songSideTwo, " ; ")[[1]],
length=as.integer(strsplit(lengthSideTwo, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# Examine side_one and side_two
side_one
## song length
## 1 Speak to Me 5400
## 2 Breathe 9780
## 3 On the Run 12600
## 4 Time 24780
## 5 The Great Gig in the Sky 15300
side_two
## song length
## 1 Money 23400
## 2 Us and Them 28260
## 3 Any Colour You Like 12240
## 4 Brain Damage 13800
## 5 Eclipse 7380
# Bind side_one and side_two into a single dataset
side_one %>%
bind_rows(side_two)
## song length
## 1 Speak to Me 5400
## 2 Breathe 9780
## 3 On the Run 12600
## 4 Time 24780
## 5 The Great Gig in the Sky 15300
## 6 Money 23400
## 7 Us and Them 28260
## 8 Any Colour You Like 12240
## 9 Brain Damage 13800
## 10 Eclipse 7380
# Create shorter version of jimi
jimi <- list(data.frame(song=c("Purple Haze", "Hey Joe", "Fire"),
length=c(9960L, 12180L, 9240L),
stringsAsFactors=FALSE
),
data.frame(song=c("EXP", "Little Wing", "Little Miss Lover", "Bold as Love"),
length=c(6900L, 8640L, 8400L, 15060L),
stringsAsFactors=FALSE
),
data.frame(song=c("Voodoo Chile", "Gypsy Eyes"),
length=c(54000L, 13380L),
stringsAsFactors=FALSE
)
)
names(jimi) <- c("Are You Experienced", "Axis: Bold As Love", "Electric Ladyland")
discography <- data.frame(album=names(jimi),
year=c(1967L, 1967L, 1968L),
stringsAsFactors=FALSE
)
# Examine discography and jimi
discography
## album year
## 1 Are You Experienced 1967
## 2 Axis: Bold As Love 1967
## 3 Electric Ladyland 1968
jimi
## $`Are You Experienced`
## song length
## 1 Purple Haze 9960
## 2 Hey Joe 12180
## 3 Fire 9240
##
## $`Axis: Bold As Love`
## song length
## 1 EXP 6900
## 2 Little Wing 8640
## 3 Little Miss Lover 8400
## 4 Bold as Love 15060
##
## $`Electric Ladyland`
## song length
## 1 Voodoo Chile 54000
## 2 Gypsy Eyes 13380
jimi %>%
# Bind jimi into a single data frame
bind_rows(.id="album") %>%
# Make a complete data frame
left_join(discography, by="album")
## album song length year
## 1 Are You Experienced Purple Haze 9960 1967
## 2 Are You Experienced Hey Joe 12180 1967
## 3 Are You Experienced Fire 9240 1967
## 4 Axis: Bold As Love EXP 6900 1967
## 5 Axis: Bold As Love Little Wing 8640 1967
## 6 Axis: Bold As Love Little Miss Lover 8400 1967
## 7 Axis: Bold As Love Bold as Love 15060 1967
## 8 Electric Ladyland Voodoo Chile 54000 1968
## 9 Electric Ladyland Gypsy Eyes 13380 1968
# Create the hank data
songHankYears <- "Move It On Over ; My Love for You (Has Turned to Hate) ; Never Again (Will I Knock on Your Door) ; On the Banks of the Old Ponchartrain ; Pan American ; Wealth Won't Save Your Soul ; A Mansion on the Hill ; Honky Tonkin' ; I Saw the Light ; I'm a Long Gone Daddy ; My Sweet Love Ain't Around ; I'm So Lonesome I Could Cry ; Lost Highway ; Lovesick Blues ; Mind Your Own Business ; My Bucket's Got a Hole in It ; Never Again (Will I Knock on Your Door) ; Wedding Bells ; You're Gonna Change (Or I'm Gonna Leave) ; I Just Don't Like This Kind of Living ; Long Gone Lonesome Blues ; Moanin' the Blues ; My Son Calls Another Man Daddy ; Nobody's Lonesome for Me ; They'll Never Take Her Love from Me ; Why Don't You Love Me ; Why Should We Try Anymore ; (I Heard That) Lonesome Whistle ; Baby, We're Really in Love ; Cold, Cold Heart ; Crazy Heart ; Dear John ; Hey Good Lookin' ; Howlin' At the Moon ; I Can't Help It (If I'm Still in Love With You) ; Half as Much ; Honky Tonk Blues ; I'll Never Get Out of This World Alive ; Jambalaya (On the Bayou) ; Settin' the Woods on Fire ; You Win Again ; Calling You ; I Won't Be Home No More ; Kaw-Liga ; Take These Chains from My Heart ; Weary Blues from Waitin' ; Your Cheatin' Heart ; (I'm Gonna) Sing, Sing, Sing ; How Can You Refuse Him Now ; I'm Satisfied with You ; You Better Keep It on Your Mind ; A Teardrop on a Rose ; At the First Fall of Snow ; Mother Is Gone ; Please Don't Let Me Love You ; Thank God ; A Home in Heaven ; California Zephyr ; Singing Waterfall ; There's No Room in My Heart for the Blues ; Leave Me Alone with the Blues ; Ready to Go Home ; The Waltz of the Wind ; Just Waitin' ; The Pale Horse and His Rider ; Kaw-Liga ; There's a Tear in My Beer"
# Raw " ; "-separated data for the Hank Williams bind_cols() exercise:
# one year per charted song, the song titles (alphabetical), and each
# song's peak chart position ("NA" for songs that did not chart).
yearHankYears <- "1947 ; 1947 ; 1947 ; 1947 ; 1947 ; 1947 ; 1948 ; 1948 ; 1948 ; 1948 ; 1948 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1952 ; 1952 ; 1952 ; 1952 ; 1952 ; 1952 ; 1953 ; 1953 ; 1953 ; 1953 ; 1953 ; 1953 ; 1954 ; 1954 ; 1954 ; 1954 ; 1955 ; 1955 ; 1955 ; 1955 ; 1955 ; 1956 ; 1956 ; 1956 ; 1956 ; 1957 ; 1957 ; 1957 ; 1958 ; 1965 ; 1966 ; 1989"
songHankCharts <- "(I Heard That) Lonesome Whistle ; (I'm Gonna) Sing, Sing, Sing ; A Home in Heaven ; A Mansion on the Hill ; A Teardrop on a Rose ; At the First Fall of Snow ; Baby, We're Really in Love ; California Zephyr ; Calling You ; Cold, Cold Heart ; Crazy Heart ; Dear John ; Half as Much ; Hey Good Lookin' ; Honky Tonk Blues ; Honky Tonkin' ; How Can You Refuse Him Now ; Howlin' At the Moon ; I Can't Help It (If I'm Still in Love With You) ; I Just Don't Like This Kind of Living ; I Saw the Light ; I Won't Be Home No More ; I'll Never Get Out of This World Alive ; I'm a Long Gone Daddy ; I'm Satisfied with You ; I'm So Lonesome I Could Cry ; Jambalaya (On the Bayou) ; Just Waitin' ; Kaw-Liga ; Kaw-Liga ; Leave Me Alone with the Blues ; Long Gone Lonesome Blues ; Lost Highway ; Lovesick Blues ; Mind Your Own Business ; Moanin' the Blues ; Mother Is Gone ; Move It On Over ; My Bucket's Got a Hole in It ; My Love for You (Has Turned to Hate) ; My Son Calls Another Man Daddy ; My Sweet Love Ain't Around ; Never Again (Will I Knock on Your Door) ; Never Again (Will I Knock on Your Door) ; Nobody's Lonesome for Me ; On the Banks of the Old Ponchartrain ; Pan American ; Please Don't Let Me Love You ; Ready to Go Home ; Settin' the Woods on Fire ; Singing Waterfall ; Take These Chains from My Heart ; Thank God ; The Pale Horse and His Rider ; The Waltz of the Wind ; There's a Tear in My Beer ; There's No Room in My Heart for the Blues ; They'll Never Take Her Love from Me ; Wealth Won't Save Your Soul ; Weary Blues from Waitin' ; Wedding Bells ; Why Don't You Love Me ; Why Should We Try Anymore ; You Better Keep It on Your Mind ; You Win Again ; You're Gonna Change (Or I'm Gonna Leave) ; Your Cheatin' Heart"
peakHankCharts <- "9 ; NA ; NA ; 12 ; NA ; NA ; 4 ; NA ; NA ; 1 ; 4 ; 8 ; 2 ; 1 ; 2 ; 14 ; NA ; 3 ; 2 ; 5 ; NA ; 4 ; 1 ; 6 ; NA ; 2 ; 1 ; NA ; 1 ; NA ; NA ; 1 ; 12 ; 1 ; 5 ; 1 ; NA ; 4 ; 2 ; NA ; 9 ; NA ; NA ; 6 ; 9 ; NA ; NA ; 9 ; NA ; 2 ; NA ; 1 ; NA ; NA ; NA ; 7 ; NA ; 5 ; NA ; 7 ; 2 ; 1 ; 9 ; NA ; 10 ; 4 ; 1"
# Build the two example tables for the bind_cols() exercise.
# NOTE(review): songHankYears is used below but is never defined in this
# document (only songHankCharts is, and its song order differs from the
# printed hank_years). It is presumably defined in an earlier revision --
# confirm; as written this chunk fails with
# "object 'songHankYears' not found".
hank_years <- data.frame(year=as.integer(strsplit(yearHankYears, " ; ")[[1]]),
song=strsplit(songHankYears, " ; ")[[1]],
stringsAsFactors=FALSE
)
hank_charts <- data.frame(song=strsplit(songHankCharts, " ; ")[[1]],
peak=as.integer(strsplit(peakHankCharts, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# The warning below is expected: as.integer() coerces the literal string
# "NA" to a true missing value and warns while doing so.
## Warning in data.frame(song = strsplit(songHankCharts, " ; ")[[1]], peak =
## as.integer(strsplit(peakHankCharts, : NAs introduced by coercion
# Examine hank_years and hank_charts
# (as_tibble() is used only for the compact 10-row printing)
tibble::as_tibble(hank_years)
## # A tibble: 67 × 2
## year song
## <int> <chr>
## 1 1947 Move It On Over
## 2 1947 My Love for You (Has Turned to Hate)
## 3 1947 Never Again (Will I Knock on Your Door)
## 4 1947 On the Banks of the Old Ponchartrain
## 5 1947 Pan American
## 6 1947 Wealth Won't Save Your Soul
## 7 1948 A Mansion on the Hill
## 8 1948 Honky Tonkin'
## 9 1948 I Saw the Light
## 10 1948 I'm a Long Gone Daddy
## # ... with 57 more rows
tibble::as_tibble(hank_charts)
## # A tibble: 67 × 2
## song peak
## <chr> <int>
## 1 (I Heard That) Lonesome Whistle 9
## 2 (I'm Gonna) Sing, Sing, Sing NA
## 3 A Home in Heaven NA
## 4 A Mansion on the Hill 12
## 5 A Teardrop on a Rose NA
## 6 At the First Fall of Snow NA
## 7 Baby, We're Really in Love 4
## 8 California Zephyr NA
## 9 Calling You NA
## 10 Cold, Cold Heart 1
## # ... with 57 more rows
# bind_cols() matches rows purely by POSITION: this only lines up because
# both tables are sorted by song title first.
# NOTE(review): any ordering/locale difference between the two song lists
# silently misaligns years and peaks -- a join keyed on song would be safer.
a <- hank_years %>%
# Reorder hank_years alphabetically by song title
arrange(song) %>%
# Select just the year column
select(year) %>%
# Bind the year column
bind_cols(hank_charts) %>%
# Arrange the finished dataset
arrange(year, song)
a # see the results
## year song peak
## 1 1947 Move It On Over 4
## 2 1947 My Love for You (Has Turned to Hate) NA
## 3 1947 Never Again (Will I Knock on Your Door) NA
## 4 1947 On the Banks of the Old Ponchartrain NA
## 5 1947 Pan American NA
## 6 1947 Wealth Won't Save Your Soul NA
## 7 1948 A Mansion on the Hill 12
## 8 1948 Honky Tonkin' 14
## 9 1948 I'm Satisfied with You NA
## 10 1948 I Just Don't Like This Kind of Living 5
## 11 1948 My Sweet Love Ain't Around NA
## 12 1949 I Won't Be Home No More 4
## 13 1949 Lost Highway 12
## 14 1949 Lovesick Blues 1
## 15 1949 Mind Your Own Business 5
## 16 1949 My Bucket's Got a Hole in It 2
## 17 1949 Never Again (Will I Knock on Your Door) 6
## 18 1949 Wedding Bells 2
## 19 1949 You Better Keep It on Your Mind NA
## 20 1950 I'm a Long Gone Daddy 6
## 21 1950 Long Gone Lonesome Blues 1
## 22 1950 Moanin' the Blues 1
## 23 1950 My Son Calls Another Man Daddy 9
## 24 1950 Nobody's Lonesome for Me 9
## 25 1950 They'll Never Take Her Love from Me 5
## 26 1950 Why Don't You Love Me 1
## 27 1950 Why Should We Try Anymore 9
## 28 1951 (I'm Gonna) Sing, Sing, Sing NA
## 29 1951 Baby, We're Really in Love 4
## 30 1951 Cold, Cold Heart 1
## 31 1951 Crazy Heart 4
## 32 1951 Dear John 8
## 33 1951 Hey Good Lookin' 1
## 34 1951 Howlin' At the Moon 3
## 35 1951 I'll Never Get Out of This World Alive 1
## 36 1952 Half as Much 2
## 37 1952 Honky Tonk Blues 2
## 38 1952 I Can't Help It (If I'm Still in Love With You) 2
## 39 1952 Jambalaya (On the Bayou) 1
## 40 1952 Settin' the Woods on Fire 2
## 41 1952 You're Gonna Change (Or I'm Gonna Leave) 4
## 42 1953 Calling You NA
## 43 1953 I'm So Lonesome I Could Cry 2
## 44 1953 Kaw-Liga 1
## 45 1953 Take These Chains from My Heart 1
## 46 1953 Weary Blues from Waitin' 7
## 47 1953 Your Cheatin' Heart 1
## 48 1954 (I Heard That) Lonesome Whistle 9
## 49 1954 How Can You Refuse Him Now NA
## 50 1954 I Saw the Light NA
## 51 1954 You Win Again 10
## 52 1955 A Teardrop on a Rose NA
## 53 1955 At the First Fall of Snow NA
## 54 1955 Mother Is Gone NA
## 55 1955 Please Don't Let Me Love You 9
## 56 1955 Thank God NA
## 57 1956 A Home in Heaven NA
## 58 1956 California Zephyr NA
## 59 1956 Singing Waterfall NA
## 60 1956 There's No Room in My Heart for the Blues NA
## 61 1957 Leave Me Alone with the Blues NA
## 62 1957 Ready to Go Home NA
## 63 1957 The Waltz of the Wind NA
## 64 1958 Just Waitin' NA
## 65 1965 The Pale Horse and His Rider NA
## 66 1966 Kaw-Liga NA
## 67 1989 There's a Tear in My Beer 7
# Pull the three aligned columns out of the combined table as plain vectors
hank_year <- a$year
hank_song <- a$song
hank_peak <- a$peak
# Make combined data frame using tibble() -- dplyr's data_frame() is
# deprecated; tibble::tibble() is the drop-in replacement and produces
# the same tibble shown below
tibble::tibble(year=hank_year, song=hank_song, peak=hank_peak) %>%
# Extract songs where peak equals 1
filter(peak == 1)
## # A tibble: 11 × 3
## year song peak
## <int> <chr> <int>
## 1 1949 Lovesick Blues 1
## 2 1950 Long Gone Lonesome Blues 1
## 3 1950 Moanin' the Blues 1
## 4 1950 Why Don't You Love Me 1
## 5 1951 Cold, Cold Heart 1
## 6 1951 Hey Good Lookin' 1
## 7 1951 I'll Never Get Out of This World Alive 1
## 8 1952 Jambalaya (On the Bayou) 1
## 9 1953 Kaw-Liga 1
## 10 1953 Take These Chains from My Heart 1
## 11 1953 Your Cheatin' Heart 1
# Store the three parallel vectors in a named list (same data, list form)
hank <- list(year=hank_year, song=hank_song, peak=hank_peak)
# Examine the contents of hank
hank
## $year
## [1] 1947 1947 1947 1947 1947 1947 1948 1948 1948 1948 1948 1949 1949 1949
## [15] 1949 1949 1949 1949 1949 1950 1950 1950 1950 1950 1950 1950 1950 1951
## [29] 1951 1951 1951 1951 1951 1951 1951 1952 1952 1952 1952 1952 1952 1953
## [43] 1953 1953 1953 1953 1953 1954 1954 1954 1954 1955 1955 1955 1955 1955
## [57] 1956 1956 1956 1956 1957 1957 1957 1958 1965 1966 1989
##
## $song
## [1] "Move It On Over"
## [2] "My Love for You (Has Turned to Hate)"
## [3] "Never Again (Will I Knock on Your Door)"
## [4] "On the Banks of the Old Ponchartrain"
## [5] "Pan American"
## [6] "Wealth Won't Save Your Soul"
## [7] "A Mansion on the Hill"
## [8] "Honky Tonkin'"
## [9] "I'm Satisfied with You"
## [10] "I Just Don't Like This Kind of Living"
## [11] "My Sweet Love Ain't Around"
## [12] "I Won't Be Home No More"
## [13] "Lost Highway"
## [14] "Lovesick Blues"
## [15] "Mind Your Own Business"
## [16] "My Bucket's Got a Hole in It"
## [17] "Never Again (Will I Knock on Your Door)"
## [18] "Wedding Bells"
## [19] "You Better Keep It on Your Mind"
## [20] "I'm a Long Gone Daddy"
## [21] "Long Gone Lonesome Blues"
## [22] "Moanin' the Blues"
## [23] "My Son Calls Another Man Daddy"
## [24] "Nobody's Lonesome for Me"
## [25] "They'll Never Take Her Love from Me"
## [26] "Why Don't You Love Me"
## [27] "Why Should We Try Anymore"
## [28] "(I'm Gonna) Sing, Sing, Sing"
## [29] "Baby, We're Really in Love"
## [30] "Cold, Cold Heart"
## [31] "Crazy Heart"
## [32] "Dear John"
## [33] "Hey Good Lookin'"
## [34] "Howlin' At the Moon"
## [35] "I'll Never Get Out of This World Alive"
## [36] "Half as Much"
## [37] "Honky Tonk Blues"
## [38] "I Can't Help It (If I'm Still in Love With You)"
## [39] "Jambalaya (On the Bayou)"
## [40] "Settin' the Woods on Fire"
## [41] "You're Gonna Change (Or I'm Gonna Leave)"
## [42] "Calling You"
## [43] "I'm So Lonesome I Could Cry"
## [44] "Kaw-Liga"
## [45] "Take These Chains from My Heart"
## [46] "Weary Blues from Waitin'"
## [47] "Your Cheatin' Heart"
## [48] "(I Heard That) Lonesome Whistle"
## [49] "How Can You Refuse Him Now"
## [50] "I Saw the Light"
## [51] "You Win Again"
## [52] "A Teardrop on a Rose"
## [53] "At the First Fall of Snow"
## [54] "Mother Is Gone"
## [55] "Please Don't Let Me Love You"
## [56] "Thank God"
## [57] "A Home in Heaven"
## [58] "California Zephyr"
## [59] "Singing Waterfall"
## [60] "There's No Room in My Heart for the Blues"
## [61] "Leave Me Alone with the Blues"
## [62] "Ready to Go Home"
## [63] "The Waltz of the Wind"
## [64] "Just Waitin'"
## [65] "The Pale Horse and His Rider"
## [66] "Kaw-Liga"
## [67] "There's a Tear in My Beer"
##
## $peak
## [1] 4 NA NA NA NA NA 12 14 NA 5 NA 4 12 1 5 2 6 2 NA 6 1 1 9
## [24] 9 5 1 9 NA 4 1 4 8 1 3 1 2 2 2 1 2 4 NA 2 1 1 7
## [47] 1 9 NA NA 10 NA NA NA 9 NA NA NA NA NA NA NA NA NA NA NA 7
# Convert the hank list into a data frame
as_data_frame(hank) %>%
# Extract songs where peak equals 1
filter(peak == 1)
## # A tibble: 11 × 3
## year song peak
## <int> <chr> <int>
## 1 1949 Lovesick Blues 1
## 2 1950 Long Gone Lonesome Blues 1
## 3 1950 Moanin' the Blues 1
## 4 1950 Why Don't You Love Me 1
## 5 1951 Cold, Cold Heart 1
## 6 1951 Hey Good Lookin' 1
## 7 1951 I'll Never Get Out of This World Alive 1
## 8 1952 Jambalaya (On the Bayou) 1
## 9 1953 Kaw-Liga 1
## 10 1953 Take These Chains from My Heart 1
## 11 1953 Your Cheatin' Heart 1
### ** DO NOT RUN DUE TO NOT HAVING DATASET
# Examine the contents of michael
# michael
# bind_rows(michael, .id="album") %>%
# group_by(album) %>%
# mutate(rank = min_rank(peak)) %>%
# filter(rank == 1) %>%
# select(-rank, -peak)
# Numeric-looking factors are a classic trap: a factor stores integer
# level codes plus a character "levels" attribute.
y <- factor(c(5, 6, 7, 6))
y
## [1] 5 6 7 6
## Levels: 5 6 7
# unclass() exposes the internal integer codes and the levels attribute
unclass(y)
## [1] 1 2 3 2
## attr(,"levels")
## [1] "5" "6" "7"
as.character(y)
## [1] "5" "6" "7" "6"
# as.numeric() on a factor returns the level CODES, not the labels
as.numeric(y)
## [1] 1 2 3 2
# Convert to character first to recover the original numbers
as.numeric(as.character(y))
## [1] 5 6 7 6
### ** DO NOT RUN DUE TO NOT HAVING DATASET
# seventies %>%
# Coerce seventies$year into a useful numeric
# mutate(year = as.numeric(as.character(year))) %>%
# Bind the updated version of seventies to sixties
# bind_rows(sixties) %>%
# arrange(year)
Chapter 4 - Advanced Joining
What can go wrong? General issues can be considered as a 2x2 matrix, where key values and/or key columns can be either missing and/or duplicated:
Defining the keys - expanding on the previous approaches that have always used by= explicitly in the join function:
Joining multiple tables is an extension of joining two tables:
Other implementations can be available:
Example code includes:
# The join key (song title) lives in the ROW NAMES of stage_songs, so it
# must be promoted to a real column before joining.
stage_songs <- data.frame(musical=c("Into the Woods", "West Side Story",
"Cats", "Phantom of the Opera"
),
year=c(1986L, 1957L, 1981L, 1986L),
stringsAsFactors=FALSE
)
rownames(stage_songs) <- c("Children Will Listen", "Maria",
"Memory", "The Music of the Night"
)
stage_writers <- data.frame(song=rownames(stage_songs),
composer=c("Stephen Sondheim", "Louis Bernstein",
"Andrew Lloyd Webber", "Andrew Lloyd Webber"
),
stringsAsFactors=FALSE
)
stage_songs %>%
# Add row names as a column named song
tibble::rownames_to_column(var="song") %>%
# Left join stage_writers to stage_songs
left_join(stage_writers, by="song")
## song musical year composer
## 1 Children Will Listen Into the Woods 1986 Stephen Sondheim
## 2 Maria West Side Story 1957 Louis Bernstein
## 3 Memory Cats 1981 Andrew Lloyd Webber
## 4 The Music of the Night Phantom of the Opera 1986 Andrew Lloyd Webber
# Demonstrates a missing-key hazard: each table has one NA in its key.
singers <- data.frame(movie=c(NA, "The Sound of Music"),
singer=c("Arnold Schwarzenegger", "Julie Andrews"),
stringsAsFactors=FALSE
)
two_songs <- data.frame(movie=c("The Sound of Music", NA),
song=c("Do-Re-Mi", "A Spoonful of Sugar"),
stringsAsFactors=FALSE
)
# Examine the result of joining singers to two_songs
# dplyr joins match NA keys to each other by default, so the two NA rows
# pair up (row 2 of the output) -- usually not what you want.
two_songs %>% inner_join(singers, by = "movie")
## movie song singer
## 1 The Sound of Music Do-Re-Mi Julie Andrews
## 2 <NA> A Spoonful of Sugar Arnold Schwarzenegger
# Remove NA's from key before joining
two_songs %>%
filter(!is.na(movie)) %>%
inner_join(singers, by = "movie")
## movie song singer
## 1 The Sound of Music Do-Re-Mi Julie Andrews
# Two tables that BOTH have a non-key column called "name" (artist vs
# studio); the join key is "movie".
movieMovieYears <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can"
nameMovieYears <- "Bing Crosby ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Frank Sinatra"
yearMovieYears <- "1942 ; 1944 ; 1945 ; 1946 ; 1954 ; 1955 ; 1956 ; 1957 ; 1957 ; 1960"
movieMovieStudios <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can"
nameMovieStudios <- "Paramount Pictures ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures ; Columbia Pictures ; Twentieth-Century Fox"
movie_years <- data.frame(movie=strsplit(movieMovieYears, " ; ")[[1]],
name=strsplit(nameMovieYears, " ; ")[[1]],
year=as.integer(strsplit(yearMovieYears, " ; ")[[1]]),
stringsAsFactors=FALSE
)
movie_studios <- data.frame(movie=strsplit(movieMovieStudios, " ; ")[[1]],
name=strsplit(nameMovieStudios, " ; ")[[1]],
stringsAsFactors=FALSE
)
# Because both inputs carry a "name" column that is not part of the key,
# the join disambiguates them with the .x/.y suffixes, which we then
# rename to meaningful labels.
movie_years %>%
# Left join movie_studios to movie_years
left_join(movie_studios, by="movie") %>%
# Rename the columns: artist and studio
rename(artist=name.x, studio=name.y)
## movie artist year studio
## 1 The Road to Morocco Bing Crosby 1942 Paramount Pictures
## 2 Going My Way Bing Crosby 1944 Paramount Pictures
## 3 Anchors Aweigh Frank Sinatra 1945 Metro-Goldwyn-Mayer
## 4 Till the Clouds Roll By Frank Sinatra 1946 Metro-Goldwyn-Mayer
## 5 White Christmas Bing Crosby 1954 Paramount Pictures
## 6 The Tender Trap Frank Sinatra 1955 Metro-Goldwyn-Mayer
## 7 High Society Bing Crosby 1956 Metro-Goldwyn-Mayer
## 8 The Joker is Wild Frank Sinatra 1957 Paramount Pictures
## 9 Pal Joey Frank Sinatra 1957 Columbia Pictures
## 10 Can-Can Frank Sinatra 1960 Twentieth-Century Fox
# "name" means different things in each table: the movie title in
# elvis_movies but the SONG title in elvis_songs. The actual key pair is
# elvis_movies$name <-> elvis_songs$movie.
elvis_movies <- data.frame(name=c("Jailhouse Rock", "Blue Hawaii",
"Viva Las Vegas", "Clambake"
),
year=c(1957L, 1961L, 1963L, 1967L),
stringsAsFactors=FALSE
)
elvTemp <- "(You're So Square) Baby I Don't Care ; I Can't Help Falling in Love ; Jailhouse Rock ; Viva Las Vegas ; You Don't Know Me"
elvis_songs <- data.frame(name=strsplit(elvTemp, " ; ")[[1]],
movie=elvis_movies$name[c(1, 2, 1, 3, 4)],
stringsAsFactors=FALSE
)
# Identify the key column
elvis_songs
## name movie
## 1 (You're So Square) Baby I Don't Care Jailhouse Rock
## 2 I Can't Help Falling in Love Blue Hawaii
## 3 Jailhouse Rock Jailhouse Rock
## 4 Viva Las Vegas Viva Las Vegas
## 5 You Don't Know Me Clambake
elvis_movies
## name year
## 1 Jailhouse Rock 1957
## 2 Blue Hawaii 1961
## 3 Viva Las Vegas 1963
## 4 Clambake 1967
# A named `by` vector joins columns with different names:
# elvis_movies$name == elvis_songs$movie. The song title comes through as
# name.y (both tables had a "name" column), hence the rename below.
elvis_movies %>%
# Left join elvis_songs to elvis_movies by this column
left_join(elvis_songs, by=c("name"="movie")) %>%
# Rename columns
rename(movie=name, song=name.y)
## movie year song
## 1 Jailhouse Rock 1957 (You're So Square) Baby I Don't Care
## 2 Jailhouse Rock 1957 Jailhouse Rock
## 3 Blue Hawaii 1961 I Can't Help Falling in Love
## 4 Viva Las Vegas 1963 Viva Las Vegas
## 5 Clambake 1967 You Don't Know Me
# Two-level encoding: " : " separates the columns of the table, and
# within each column " ; " separates the row values.
mdData <- "Anchors Aweigh ; Can-Can ; Going My Way ; High Society ; Pal Joey ; The Joker is Wild ; The Road to Morocco ; The Tender Trap ; Till the Clouds Roll By ; White Christmas : George Sidney ; Walter Lang ; Leo McCarey ; Charles Walters ; George Sidney ; Charles Vidor ; David Butler ; Charles Walters ; Richard Whorf ; Michael Curtiz : Metro-Goldwyn-Mayer ; Twentieth-Century Fox ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Columbia Pictures ; Paramount Pictures ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures"
myData <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can : Bing Crosby ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Frank Sinatra : 1942 ; 1944 ; 1945 ; 1946 ; 1954 ; 1955 ; 1956 ; 1957 ; 1957 ; 1960"
# Split on " : " to get the columns, then split each column on " ; " to
# get its values; as.data.frame() assembles the nested list into columns.
movie_directors <- as.data.frame(lapply(strsplit(mdData, " : "),
FUN=function(x) { strsplit(x, " ; ") }
),
stringsAsFactors=FALSE
)
names(movie_directors) <- c("name", "director", "studio")
movie_years <- as.data.frame(lapply(strsplit(myData, " : "),
FUN=function(x) { strsplit(x, " ; ") }
),
stringsAsFactors=FALSE
)
names(movie_years) <- c("movie", "name", "year")
# year arrives as character; make it a proper integer column
movie_years$year <- as.integer(movie_years$year)
# Identify the key columns
movie_directors
## name director studio
## 1 Anchors Aweigh George Sidney Metro-Goldwyn-Mayer
## 2 Can-Can Walter Lang Twentieth-Century Fox
## 3 Going My Way Leo McCarey Paramount Pictures
## 4 High Society Charles Walters Metro-Goldwyn-Mayer
## 5 Pal Joey George Sidney Columbia Pictures
## 6 The Joker is Wild Charles Vidor Paramount Pictures
## 7 The Road to Morocco David Butler Paramount Pictures
## 8 The Tender Trap Charles Walters Metro-Goldwyn-Mayer
## 9 Till the Clouds Roll By Richard Whorf Metro-Goldwyn-Mayer
## 10 White Christmas Michael Curtiz Paramount Pictures
movie_years
## movie name year
## 1 The Road to Morocco Bing Crosby 1942
## 2 Going My Way Bing Crosby 1944
## 3 Anchors Aweigh Frank Sinatra 1945
## 4 Till the Clouds Roll By Frank Sinatra 1946
## 5 White Christmas Bing Crosby 1954
## 6 The Tender Trap Frank Sinatra 1955
## 7 High Society Bing Crosby 1956
## 8 The Joker is Wild Frank Sinatra 1957
## 9 Pal Joey Frank Sinatra 1957
## 10 Can-Can Frank Sinatra 1960
# Join on differently named keys (movie <-> name), then rename the
# remaining "name" column (the artist) and order the columns for reading.
movie_years %>%
# Left join movie_directors to movie_years
left_join(movie_directors, by=c("movie"="name")) %>%
# Arrange the columns using select()
rename(artist=name) %>%
select(year, movie, artist, director, studio)
## year movie artist director
## 1 1942 The Road to Morocco Bing Crosby David Butler
## 2 1944 Going My Way Bing Crosby Leo McCarey
## 3 1945 Anchors Aweigh Frank Sinatra George Sidney
## 4 1946 Till the Clouds Roll By Frank Sinatra Richard Whorf
## 5 1954 White Christmas Bing Crosby Michael Curtiz
## 6 1955 The Tender Trap Frank Sinatra Charles Walters
## 7 1956 High Society Bing Crosby Charles Walters
## 8 1957 The Joker is Wild Frank Sinatra Charles Vidor
## 9 1957 Pal Joey Frank Sinatra George Sidney
## 10 1960 Can-Can Frank Sinatra Walter Lang
## studio
## 1 Paramount Pictures
## 2 Paramount Pictures
## 3 Metro-Goldwyn-Mayer
## 4 Metro-Goldwyn-Mayer
## 5 Paramount Pictures
## 6 Metro-Goldwyn-Mayer
## 7 Metro-Goldwyn-Mayer
## 8 Paramount Pictures
## 9 Columbia Pictures
## 10 Twentieth-Century Fox
### *** DO NOT RUN DUE TO NOT HAVING DATA
# Place supergroups, more_bands, and more_artists into a list
# list(supergroups, more_bands, more_artists) %>%
# Use reduce to join together the contents of the list
# purrr::reduce(left_join, by=c("first", "last"))
# list(more_artists, more_bands, supergroups) %>%
# Return rows of more_artists in all three datasets
# purrr::reduce(semi_join, by=c("first", "last"))
# Data is available from previous
# NOTE(review): bands and artists are defined earlier in the full
# document, not in this section -- confirm they are in scope.
# Alter the code to perform the join with a dplyr function
# Base merge() with all.x = TRUE is the base-R equivalent of left_join(),
# but it re-sorts rows, hence the explicit arrange(band).
merge(bands, artists, by = c("first", "last"), all.x = TRUE) %>%
arrange(band)
## first last band instrument
## 1 Jimmy Page Led Zeppelin Guitar
## 2 John Bonham Led Zeppelin <NA>
## 3 John Paul Jones Led Zeppelin <NA>
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Charlie Watts The Rolling Stones <NA>
## 11 Keith Richards The Rolling Stones Guitar
## 12 Mick Jagger The Rolling Stones Vocals
## 13 Ronnie Woods The Rolling Stones <NA>
# The dplyr equivalent: left_join() keeps bands' original row order
# (compare the row order of this output with the merge() output above).
bands %>%
left_join(artists, by=c("first", "last"))
## first last band instrument
## 1 John Bonham Led Zeppelin <NA>
## 2 John Paul Jones Led Zeppelin <NA>
## 3 Jimmy Page Led Zeppelin Guitar
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Mick Jagger The Rolling Stones Vocals
## 11 Keith Richards The Rolling Stones Guitar
## 12 Charlie Watts The Rolling Stones <NA>
## 13 Ronnie Woods The Rolling Stones <NA>
Chapter 5 - Case Study
Lahman’s Baseball Database - the Lahman package, maintained by Sean Lahman, contains 26 tables and is loaded with library(Lahman):
Using the “Salaries” data:
The dataset “HallOfFame” contains the votes and inductions by player:
Example code includes:
library(Lahman)
## Warning: package 'Lahman' was built under R version 3.2.5
# This will be missing battingLabels, fieldingLabels, pitchingLabels, LahmanData
# For each table name listed in LahmanData$file, get() fetches the table
# itself and we record its column names; the result is a named list of
# one-column ("var") data frames, one per Lahman table.
lahmanNames <- lapply(LahmanData[, "file"],
FUN=function(x) {
data.frame(var=names(get(x)), stringsAsFactors=FALSE)
}
)
names(lahmanNames) <- LahmanData$file
# Examine lahmanNames
lahmanNames
## $AllstarFull
## var
## 1 playerID
## 2 yearID
## 3 gameNum
## 4 gameID
## 5 teamID
## 6 lgID
## 7 GP
## 8 startingPos
##
## $Appearances
## var
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 G_all
## 6 GS
## 7 G_batting
## 8 G_defense
## 9 G_p
## 10 G_c
## 11 G_1b
## 12 G_2b
## 13 G_3b
## 14 G_ss
## 15 G_lf
## 16 G_cf
## 17 G_rf
## 18 G_of
## 19 G_dh
## 20 G_ph
## 21 G_pr
##
## $AwardsManagers
## var
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsPlayers
## var
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsShareManagers
## var
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $AwardsSharePlayers
## var
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $Batting
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## 11 X3B
## 12 HR
## 13 RBI
## 14 SB
## 15 CS
## 16 BB
## 17 SO
## 18 IBB
## 19 HBP
## 20 SH
## 21 SF
## 22 GIDP
##
## $BattingPost
## var
## 1 yearID
## 2 round
## 3 playerID
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## 11 X3B
## 12 HR
## 13 RBI
## 14 SB
## 15 CS
## 16 BB
## 17 SO
## 18 IBB
## 19 HBP
## 20 SH
## 21 SF
## 22 GIDP
##
## $CollegePlaying
## var
## 1 playerID
## 2 schoolID
## 3 yearID
##
## $Fielding
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 PB
## 15 WP
## 16 SB
## 17 CS
## 18 ZR
##
## $FieldingOF
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 Glf
## 5 Gcf
## 6 Grf
##
## $FieldingPost
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 round
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 TP
## 15 PB
## 16 SB
## 17 CS
##
## $HallOfFame
## var
## 1 playerID
## 2 yearID
## 3 votedBy
## 4 ballots
## 5 needed
## 6 votes
## 7 inducted
## 8 category
## 9 needed_note
##
## $Managers
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 G
## 7 W
## 8 L
## 9 rank
## 10 plyrMgr
##
## $ManagersHalf
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 half
## 7 G
## 8 W
## 9 L
## 10 rank
##
## $Master
## var
## 1 playerID
## 2 birthYear
## 3 birthMonth
## 4 birthDay
## 5 birthCountry
## 6 birthState
## 7 birthCity
## 8 deathYear
## 9 deathMonth
## 10 deathDay
## 11 deathCountry
## 12 deathState
## 13 deathCity
## 14 nameFirst
## 15 nameLast
## 16 nameGiven
## 17 weight
## 18 height
## 19 bats
## 20 throws
## 21 debut
## 22 finalGame
## 23 retroID
## 24 bbrefID
## 25 deathDate
## 26 birthDate
##
## $Pitching
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## 11 SHO
## 12 SV
## 13 IPouts
## 14 H
## 15 ER
## 16 HR
## 17 BB
## 18 SO
## 19 BAOpp
## 20 ERA
## 21 IBB
## 22 WP
## 23 HBP
## 24 BK
## 25 BFP
## 26 GF
## 27 R
## 28 SH
## 29 SF
## 30 GIDP
##
## $PitchingPost
## var
## 1 playerID
## 2 yearID
## 3 round
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## 11 SHO
## 12 SV
## 13 IPouts
## 14 H
## 15 ER
## 16 HR
## 17 BB
## 18 SO
## 19 BAOpp
## 20 ERA
## 21 IBB
## 22 WP
## 23 HBP
## 24 BK
## 25 BFP
## 26 GF
## 27 R
## 28 SH
## 29 SF
## 30 GIDP
##
## $Salaries
## var
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 salary
##
## $Schools
## var
## 1 schoolID
## 2 name_full
## 3 city
## 4 state
## 5 country
##
## $SeriesPost
## var
## 1 yearID
## 2 round
## 3 teamIDwinner
## 4 lgIDwinner
## 5 teamIDloser
## 6 lgIDloser
## 7 wins
## 8 losses
## 9 ties
##
## $Teams
## var
## 1 yearID
## 2 lgID
## 3 teamID
## 4 franchID
## 5 divID
## 6 Rank
## 7 G
## 8 Ghome
## 9 W
## 10 L
## 11 DivWin
## 12 WCWin
## 13 LgWin
## 14 WSWin
## 15 R
## 16 AB
## 17 H
## 18 X2B
## 19 X3B
## 20 HR
## 21 BB
## 22 SO
## 23 SB
## 24 CS
## 25 HBP
## 26 SF
## 27 RA
## 28 ER
## 29 ERA
## 30 CG
## 31 SHO
## 32 SV
## 33 IPouts
## 34 HA
## 35 HRA
## 36 BBA
## 37 SOA
## 38 E
## 39 DP
## 40 FP
## 41 name
## 42 park
## 43 attendance
## 44 BPF
## 45 PPF
## 46 teamIDBR
## 47 teamIDlahman45
## 48 teamIDretro
##
## $TeamsFranchises
## var
## 1 franchID
## 2 franchName
## 3 active
## 4 NAassoc
##
## $TeamsHalf
## var
## 1 yearID
## 2 lgID
## 3 teamID
## 4 Half
## 5 divID
## 6 DivWin
## 7 Rank
## 8 G
## 9 W
## 10 L
# Find variables in common
# Fold intersect() across every table's variable list; the empty result
# below means no single variable appears in ALL of the tables.
purrr::reduce(lahmanNames, intersect)
## [1] var
## <0 rows> (or 0-length row.names)
# Count how many Lahman tables each variable name appears in,
# keeping only the shared ones (candidate join keys), most common first.
lahmanNames %>%
# Bind the data frames in lahmanNames
bind_rows(.id="dataframe") %>%
# Group the result by var
group_by(var) %>%
# Tally the number of appearances
tally() %>%
# Filter the data
filter(n > 1) %>%
# Arrange the results
arrange(-n)
## # A tibble: 57 × 2
## var n
## <chr> <int>
## 1 yearID 21
## 2 playerID 19
## 3 lgID 17
## 4 teamID 13
## 5 G 10
## 6 L 6
## 7 W 6
## 8 BB 5
## 9 CS 5
## 10 GS 5
## # ... with 47 more rows
# Which tables contain a playerID column? Stack the per-table variable
# lists (tagging each row with its table name via .id), keep the playerID
# rows, and pull the table names out as a character vector.
lahmanNames %>%
# Stack the variable lists, one row per (table, variable) pair
bind_rows(.id="dataframe") %>%
# Keep only the rows whose variable is playerID
filter(var=="playerID") %>%
# Extract the table names (magrittr exposes the piped value as `.`)
.$dataframe
## [1] "AllstarFull" "Appearances" "AwardsManagers"
## [4] "AwardsPlayers" "AwardsShareManagers" "AwardsSharePlayers"
## [7] "Batting" "BattingPost" "CollegePlaying"
## [10] "Fielding" "FieldingOF" "FieldingPost"
## [13] "HallOfFame" "Managers" "ManagersHalf"
## [16] "Master" "Pitching" "PitchingPost"
## [19] "Salaries"
# One row per player with their name, taken from the Master table
players <- Master %>%
# Return the columns playerID, nameFirst and nameLast
select(playerID, nameFirst, nameLast) %>%
# Return one row for each distinct player
distinct()
# anti_join() is a filtering join: it keeps players with NO match in
# Salaries (players we have no salary data for).
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by="playerID") %>%
# Count them
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 13888
# semi_join() keeps only the unsalaried players that DO appear in
# Appearances -- i.e. those who actually played games.
players %>%
anti_join(Salaries, by = "playerID") %>%
# How many unsalaried players appear in Appearances?
semi_join(Appearances, by="playerID") %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 13695
# Total career games (G_all) for each player missing from Salaries;
# left_join keeps players with no Appearances rows (their sum is 0
# thanks to na.rm=TRUE over an empty/NA group).
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by="playerID") %>%
# Join them to Appearances
left_join(Appearances, by="playerID") %>%
# Calculate total_games for each player
group_by(playerID) %>%
summarize(total_games=sum(G_all, na.rm=TRUE)) %>%
# Arrange in descending order by total_games
arrange(-total_games)
## # A tibble: 13,888 × 2
## playerID total_games
## <chr> <int>
## 1 yastrca01 3308
## 2 aaronha01 3298
## 3 cobbty01 3034
## 4 musiast01 3026
## 5 mayswi01 2992
## 6 robinbr01 2896
## 7 kalinal01 2834
## 8 collied01 2826
## 9 robinfr02 2808
## 10 wagneho01 2794
## # ... with 13,878 more rows
# Same pattern with the Batting table, summing at-bats (AB).
# NOTE(review): the result column is named total_games but actually holds
# total AT-BATS -- kept as-is to match the transcript below, but the name
# is misleading.
players %>%
# Find unsalaried players
anti_join(Salaries, by="playerID") %>%
# Join Batting to the unsalaried players
left_join(Batting, by="playerID") %>%
# Group by player
group_by(playerID) %>%
# Sum at-bats for each player
summarize(total_games=sum(AB, na.rm=TRUE)) %>%
# Arrange in descending order
arrange(-total_games)
## # A tibble: 13,888 × 2
## playerID total_games
## <chr> <int>
## 1 aaronha01 12364
## 2 yastrca01 11988
## 3 cobbty01 11434
## 4 musiast01 10972
## 5 mayswi01 10881
## 6 robinbr01 10654
## 7 wagneho01 10430
## 8 brocklo01 10332
## 9 ansonca01 10277
## 10 aparilu01 10230
## # ... with 13,878 more rows
# Find the distinct players that appear in HallOfFame
# (distinct() collapses players nominated in multiple years to one row)
nominated <- HallOfFame %>%
select(playerID) %>%
distinct()
nominated %>%
# Count the number of players in nominated
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 1239
# Attach first/last names to the nominated players
nominated_full <- nominated %>%
# Join to Master
left_join(Master, by="playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
# Find distinct players in HallOfFame with inducted == "Y"
inducted <- HallOfFame %>%
filter(inducted == "Y") %>%
select(playerID) %>%
distinct()
inducted %>%
# Count the number of players in inducted
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 312
# Attach first/last names to the inducted players
inducted_full <- inducted %>%
# Join to Master
left_join(Master, by="playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
# Tally the number of awards in AwardsPlayers by playerID
# (one row per player, n = number of award records)
nAwards <- AwardsPlayers %>%
group_by(playerID) %>%
tally()
# Average number of awards among players who were inducted
nAwards %>%
# Filter to just the players in inducted
semi_join(inducted, by="playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n=mean(n, na.rm=TRUE))
## # A tibble: 1 × 1
## avg_n
## <dbl>
## 1 12.10582
# Average number of awards among players nominated but NOT inducted --
# compare with the inducted players' average above.
nAwards %>%
# Filter to just the players in nominated
semi_join(nominated, by="playerID") %>%
# Filter to players NOT in inducted
anti_join(inducted, by="playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n=mean(n, na.rm=TRUE))
## # A tibble: 1 × 1
## avg_n
## <dbl>
## 1 4.18985
# Find the players who are in nominated, but not inducted
# (setdiff() on one-column data frames is a set difference of players)
notInducted <- nominated %>%
setdiff(inducted)
# Average of each not-inducted player's career-maximum salary
Salaries %>%
# Find the players who are in notInducted
semi_join(notInducted, by="playerID") %>%
# Calculate the max salary by player
group_by(playerID) %>%
summarize(max_salary=max(salary)) %>%
# Calculate the average of the max salaries
summarize(avg_salary=mean(max_salary))
## # A tibble: 1 × 1
## avg_salary
## <dbl>
## 1 4876812
# Repeat for players who were inducted
Salaries %>%
semi_join(inducted, by="playerID") %>%
group_by(playerID) %>%
summarize(max_salary=max(salary)) %>%
summarize(avg_salary=mean(max_salary))
## # A tibble: 1 × 1
## avg_salary
## <dbl>
## 1 5673190
# Sanity check: flag Hall of Fame votes cast in or before the player's
# final playing season (yearID - last_year < 1).
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by="playerID") %>%
# Find last year played by player
group_by(playerID) %>%
summarize(last_year=max(yearID)) %>%
# Join to full HallOfFame
left_join(HallOfFame, by="playerID") %>%
# Filter for unusual observations
filter((yearID - last_year) < 1)
## # A tibble: 39 × 10
## playerID last_year yearID votedBy ballots needed votes
## <chr> <int> <int> <chr> <int> <int> <int>
## 1 cissebi01 1938 1937 BBWAA 201 151 1
## 2 cochrmi01 1937 1936 BBWAA 226 170 80
## 3 deandi01 1947 1945 BBWAA 247 186 17
## 4 deandi01 1947 1946 Final Ballot 263 198 45
## 5 deandi01 1947 1946 Nominating Vote 202 NA 40
## 6 deandi01 1947 1947 BBWAA 161 121 88
## 7 dickebi01 1946 1945 BBWAA 247 186 17
## 8 dickebi01 1946 1946 Nominating Vote 202 NA 40
## 9 dickebi01 1946 1946 Final Ballot 263 198 32
## 10 dimagjo01 1951 1945 BBWAA 247 186 1
## # ... with 29 more rows, and 3 more variables: inducted <fctr>,
## # category <fctr>, needed_note <chr>
The data.table library is designed to simplify and speed up work with large datasets. The language is broadly analogous to SQL, with syntax that includes equivalents for SELECT, WHERE, and GROUP BY. Some general attributes of a data.table object include:
NOTE - every data.table is also a data.frame; if a package is not data.table-aware, the object simply behaves as a plain data.frame within that package.
General syntax is:
Example table creation:
Some example code includes:
library(data.table)
# a (length 2) is recycled against b (length 4), giving 4 rows with
# a = 1, 2, 1, 2 -- see the str() output below.
DT <- data.table(a = c(1, 2), b=LETTERS[1:4])
str(DT)
## Classes 'data.table' and 'data.frame': 4 obs. of 2 variables:
## $ a: num 1 2 1 2
## $ b: chr "A" "B" "C" "D"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: 1 A
## 2: 2 B
## 3: 1 C
## 4: 2 D
# Print the second to last row of DT using .N
# (.N is a special symbol holding the number of rows inside DT[...])
DT[.N-1]
## a b
## 1: 1 C
# Print the column names of DT
names(DT)
## [1] "a" "b"
# Print the number of rows and columns of DT
dim(DT)
## [1] 4 2
# Select row 2 twice and row 3, returning a data.table with three rows where row 2 is a duplicate of row 1.
# (row indices in i may repeat, unlike a key lookup)
DT[c(2, 2:3)]
## a b
## 1: 2 B
## 2: 2 B
## 3: 1 C
# Fresh 5-row table: integer A, character B, integer C
DT <- data.table(A = 1:5, B = letters[1:5], C = 6:10)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ A: int 1 2 3 4 5
## $ B: chr "a" "b" "c" "d" ...
## $ C: int 6 7 8 9 10
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: 1 a 6
## 2: 2 b 7
## 3: 3 c 8
## 4: 4 d 9
## 5: 5 e 10
# Subset rows 1 and 3, and columns B and C
# (.() is data.table's alias for list() in the j argument)
DT[c(1, 3), .(B, C)]
## B C
## 1: a 6
## 2: c 8
# Assign to ans the correct value
ans <- DT[ , .(B, val=A*C)]
ans
## B val
## 1: a 6
## 2: b 14
## 3: c 24
## 4: d 36
## 5: e 50
# Fill in the blanks such that ans2 equals target
target <- data.table(B = c("a", "b", "c", "d", "e", "a", "b", "c", "d", "e"),
val = as.integer(c(6:10, 1:5))
)
ans2 <- DT[, .(B, val = c(C, A))]
identical(target, ans2)
## [1] TRUE
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# For each Species, print the mean Sepal.Length
DT[ , mean(Sepal.Length), Species]
## Species V1
## 1: setosa 5.006
## 2: versicolor 5.936
## 3: virginica 6.588
# Print mean Sepal.Length, grouping by first letter of Species
DT[ , mean(Sepal.Length), substr(Species, 1, 1)]
## substr V1
## 1: s 5.006
## 2: v 6.262
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
identical(DT, as.data.table(iris))
## [1] TRUE
# Group the specimens by Sepal area (to the nearest 10 cm2) and count how many occur in each group.
DT[, .N, by = 10 * round(Sepal.Length * Sepal.Width / 10)]
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Now name the output columns `Area` and `Count`
DT[, .(Count=.N), by = .(Area = 10 * round(Sepal.Length * Sepal.Width / 10))]
## Area Count
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Create the data.table DT
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8)
)
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Create the new data.table, DT2
DT2 <- DT[, .(C = cumsum(C)), by = .(A, B)]
str(DT2)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 11 4 9 1 8 2 8
## - attr(*, ".internal.selfref")=<externalptr>
DT2
## A B C
## 1: b 1 3
## 2: b 1 11
## 3: b 2 4
## 4: b 2 9
## 5: a 3 1
## 6: a 3 8
## 7: a 4 2
## 8: a 4 8
# Select from DT2 the last two values from C while you group by A
DT2[, .(C = tail(C, 2)), by = A]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
The chaining operation in data.table is run as [statement][next statement].
Example code includes:
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8))
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Perform operation using chaining
DT[ , .(C = cumsum(C)), by = .(A, B)][ , .(C = tail(C, 2)), by=.(A)]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
data(iris)
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Perform chained operations on DT
DT[ , .(Sepal.Length = median(Sepal.Length), Sepal.Width = median(Sepal.Width),
Petal.Length = median(Petal.Length), Petal.Width = median(Petal.Width)),
by=.(Species)][order(-Species)]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 6.5 3.0 5.55 2.0
## 2: versicolor 5.9 2.8 4.35 1.3
## 3: setosa 5.0 3.4 1.50 0.2
# Mean of columns
# DT[ , lapply(.SD, FUN=mean), by=.(x)]
# Median of columns
# DT[ , lapply(.SD, FUN=median), by=.(x)]
# Calculate the sum of the Q columns
# DT[ , lapply(.SD, FUN=sum), , .SDcols=2:4]
# Calculate the sum of columns H1 and H2
# DT[ , lapply(.SD, FUN=sum), , .SDcols=paste0("H", 1:2)]
# Select all but the first row of groups 1 and 2, returning only the grp column and the Q columns
# foo = function(x) { x[-1] }
# DT[ , lapply(.SD, FUN=foo), by=.(grp), .SDcols=paste0("Q", 1:3)]
# Sum of all columns and the number of rows
# DT[, c(lapply(.SD, FUN=sum), .N), by=.(x), .SDcols=names(DT)]
# Cumulative sum of column x and y while grouping by x and z > 8
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")]
# Chaining
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")][ , lapply(.SD, FUN=max), by=.(by1), .SDcols=c("x", "y")]
# The data.table DT
DT <- data.table(A = letters[c(1, 1, 1, 2, 2)], B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: chr "a" "a" "a" "b" ...
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: a 1
## 2: a 2
## 3: a 3
## 4: b 4
## 5: b 5
# Add column by reference: Total
DT[ , Total:=sum(B), by=.(A)]
DT
## A B Total
## 1: a 1 6
## 2: a 2 6
## 3: a 3 6
## 4: b 4 9
## 5: b 5 9
# Add 1 to column B
DT[c(2,4) , B:=B+1L, ]
DT
## A B Total
## 1: a 1 6
## 2: a 3 6
## 3: a 3 6
## 4: b 5 9
## 5: b 5 9
# Add a new column Total2
DT[2:4, Total2:=sum(B), by=.(A)]
DT
## A B Total Total2
## 1: a 1 6 NA
## 2: a 3 6 6
## 3: a 3 6 6
## 4: b 5 9 5
## 5: b 5 9 NA
# Remove the Total column
DT[ , Total := NULL, ]
DT
## A B Total2
## 1: a 1 NA
## 2: a 3 6
## 3: a 3 6
## 4: b 5 5
## 5: b 5 NA
# Select the third column using `[[`
DT[[3]]
## [1] NA 6 6 5 NA
# A data.table DT has been created for you
DT <- data.table(A = c(1, 1, 1, 2, 2), B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: num 1 1 1 2 2
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: 1 1
## 2: 1 2
## 3: 1 3
## 4: 2 4
## 5: 2 5
# Update B, add C and D
DT[ , c("B", "C", "D") := .(B + 1, A + B, 2), ]
DT
## A B C D
## 1: 1 2 2 2
## 2: 1 3 3 2
## 3: 1 4 4 2
## 4: 2 5 6 2
## 5: 2 6 7 2
# Delete my_cols
my_cols <- c("B", "C")
DT[ , (my_cols) := NULL, ]
DT
## A D
## 1: 1 2
## 2: 1 2
## 3: 1 2
## 4: 2 2
## 5: 2 2
# Delete column 2 by number
DT[[2]] <- NULL
DT
## A
## 1: 1
## 2: 1
## 3: 1
## 4: 2
## 5: 2
# Set the seed
# set.seed(1)
# Check the DT that is made available to you
# DT
# For loop with set
# for(i in 2:4) { set(DT, sample(nrow(DT), 3), i, NA) }
# Change the column names to lowercase
# setnames(DT, letters[1:4])
# Print the resulting DT to the console
# DT
# Define DT
DT <- data.table(a = letters[c(1, 1, 1, 2, 2)], b = 1)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ a: chr "a" "a" "a" "b" ...
## $ b: num 1 1 1 1 1
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Add a suffix "_2" to all column names
setnames(DT, paste0(names(DT), "_2"))
DT
## a_2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Change column name "a_2" to "A2"
setnames(DT, "a_2", "A2")
DT
## A2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Reverse the order of the columns
setcolorder(DT, 2:1)
DT
## b_2 A2
## 1: 1 a
## 2: 1 a
## 3: 1 a
## 4: 1 b
## 5: 1 b
Example code includes:
# iris as a data.table
iris <- as.data.table(iris)
# Remove the "Sepal." prefix
names(iris) <- gsub("Sepal\\.", "", names(iris))
# Remove the two columns starting with "Petal"
iris[, c("Petal.Length", "Petal.Width") := NULL, ]
# Cleaned up iris data.table
str(iris)
## Classes 'data.table' and 'data.frame': 150 obs. of 3 variables:
## $ Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Area is greater than 20 square centimeters
iris[ Width * Length > 20 ]
## Length Width Species
## 1: 5.4 3.9 setosa
## 2: 5.8 4.0 setosa
## 3: 5.7 4.4 setosa
## 4: 5.4 3.9 setosa
## 5: 5.7 3.8 setosa
## 6: 5.2 4.1 setosa
## 7: 5.5 4.2 setosa
## 8: 7.0 3.2 versicolor
## 9: 6.4 3.2 versicolor
## 10: 6.9 3.1 versicolor
## 11: 6.3 3.3 versicolor
## 12: 6.7 3.1 versicolor
## 13: 6.7 3.0 versicolor
## 14: 6.0 3.4 versicolor
## 15: 6.7 3.1 versicolor
## 16: 6.3 3.3 virginica
## 17: 7.1 3.0 virginica
## 18: 7.6 3.0 virginica
## 19: 7.3 2.9 virginica
## 20: 7.2 3.6 virginica
## 21: 6.5 3.2 virginica
## 22: 6.8 3.0 virginica
## 23: 6.4 3.2 virginica
## 24: 7.7 3.8 virginica
## 25: 7.7 2.6 virginica
## 26: 6.9 3.2 virginica
## 27: 7.7 2.8 virginica
## 28: 6.7 3.3 virginica
## 29: 7.2 3.2 virginica
## 30: 7.2 3.0 virginica
## 31: 7.4 2.8 virginica
## 32: 7.9 3.8 virginica
## 33: 7.7 3.0 virginica
## 34: 6.3 3.4 virginica
## 35: 6.9 3.1 virginica
## 36: 6.7 3.1 virginica
## 37: 6.9 3.1 virginica
## 38: 6.8 3.2 virginica
## 39: 6.7 3.3 virginica
## 40: 6.7 3.0 virginica
## 41: 6.2 3.4 virginica
## Length Width Species
# Add new boolean column
iris[, is_large := Width * Length > 25]
## Warning in `[.data.table`(iris, , `:=`(is_large, Width * Length > 25)):
## Invalid .internal.selfref detected and fixed by taking a (shallow) copy
## of the data.table so that := can add this new column by reference. At
## an earlier point, this data.table has been copied by R (or been created
## manually using structure() or similar). Avoid key<-, names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set*
## syntax instead to avoid copying: ?set, ?setnames and ?setattr. Also, in
## R<=v3.0.2, list(DT1,DT2) copied the entire DT1 and DT2 (R's list() used to
## copy named objects); please upgrade to R>v3.0.2 if that is biting. If this
## message doesn't help, please report to datatable-help so the root cause can
## be fixed.
# Now large observations with is_large
iris[is_large == TRUE]
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
iris[(is_large)] # Also OK
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
# The 'keyed' data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12)
setkey(DT, A, B)
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "A" "B"
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Select the "b" group
DT["b"]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
# "b" and "c" groups
DT[c("b", "c")]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
## 4: c 6 12
## 5: c 9 9
# The first row of the "b" and "c" groups
DT[c("b", "c"), mult = "first"]
## A B C
## 1: b 1 8
## 2: c 6 12
# First and last row of the "b" and "c" groups
DT[c("b", "c"), .SD[c(1, .N)], by = .EACHI]
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Copy and extend code for instruction 4: add printout
DT[c("b", "c"), { print(.SD); .SD[c(1, .N)] }, by = .EACHI]
## B C
## 1: 1 8
## 2: 5 6
## 3: 8 11
## B C
## 1: 6 12
## 2: 9 9
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Get the key of DT
key(DT)
## [1] "A" "B"
# Row where A == "b" and B == 6
DT[.("b", 6)]
## A B C
## 1: b 6 NA
# Return the prevailing row
DT[.("b", 6), roll=TRUE]
## A B C
## 1: b 6 6
# Return the nearest row
DT[.("b", 6), roll="nearest"]
## A B C
## 1: b 6 6
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Print the sequence (-2):10 for the "b" group
DT[.("b", (-2):10)]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA
# Add code: carry the prevailing values forwards
DT[.("b", (-2):10), roll=TRUE]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Add code: carry the first observation backwards
DT[.("b", (-2):10), roll=TRUE, rollends=TRUE]
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
Jeff Ryan, the creator of quantmod and organizer of the R/Finance conference, co-developed xts (which extends zoo) to simplify working with time series data. The course will cover five areas (chapters):
“xts” stands for extensible time series. The core of each “xts” is a “zoo” object, consisting of a matrix plus an index.
There are a few special behaviors of xts:
The “xts” object can be de-constructed when needed:
Data usually already exists and needs to be “wrangled” into a proper format for xts/zoo. The easiest way to convert is using as.xts(). You can coerce truly external data after loading it, and can save data with write.zoo(x, “file”).
Subsetting based on time is a particular strength of xts. xts supports ISO8601:2004 (the standard, “right way”, to unambiguously consider times):
xts allows for four methods of specifying dates or intervals:
Can also use some traditional R-like methods (since xts extends zoo, and zoo extends base R):
Can set the flag which.i = TRUE to get back the matching row numbers instead of the data. For example, index <- x[“2007-06-26/2007-06-28”, which.i = TRUE].
Description of key behaviors when working with an xts object:
xts introduces a few relatives of the head() and tail() functionality. These are the first() and last() functions.
Math operations using xts - since an xts object is a matrix, be careful about matrix operations. Math operations are run only on the intersection of the time indices:
Merging time series is common. Merge (cbind, merge) combines by columns, but joining based on index.
Merge (rbind) combines by rows, though all rows must already have an index. Basically, rbind MUST be used on a time series.
Missing data is common, and xts inherits all of the zoo methods for dealing with missing data. The “locf” is the “last observation carried forward” (the latest value that is not NA) - called with na.locf:
The NA can be managed in several ways:
Lag operators and difference operations. Seasonality is a repeating pattern. There is often a need to compare seasonality – for example, compare Mondays. Stationarity refers to some bound of the series.
The lag() function will change the timestamp, so that (for example) today can be merged as last week:
The “one period lag first difference” is calculated as diff(x, lag=1, differences=1, arithmetic=TRUE, log=FALSE, na.pad=TRUE, ...).
There are two main approaches for applying functions on discrete periods or intervals:
Time series aggregation can also be handled by xts:
Time series data can also be managed in a “rolling” manner - discrete or continuous:
Internals of xts such as indices and timezones:
Final topics:
Example code includes (cached to avoid future internet calls):
library(xts)
library(zoo)
x <- matrix(data=1:4, ncol=2)
idx <- as.Date(c("2015-01-01", "2015-02-01"))
# Create the xts
X <- xts(x, order.by = idx)
# Deconstruct the xts
coredata(X, fmt=FALSE)
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
index(X)
## [1] "2015-01-01" "2015-02-01"
# Working with the sunspots data
data(sunspots)
class(sunspots)
## [1] "ts"
sunspots_xts <- as.xts(sunspots)
class(sunspots_xts)
## [1] "xts" "zoo"
head(sunspots_xts)
## [,1]
## Jan 1749 58.0
## Feb 1749 62.6
## Mar 1749 70.0
## Apr 1749 55.7
## May 1749 85.0
## Jun 1749 83.5
# Example from chapter #1
ex_matrix <- xts(matrix(data=c(1, 1, 1, 2, 2, 2), ncol=2),
order.by=as.Date(c("2016-06-01", "2016-06-02", "2016-06-03"))
)
core <- coredata(ex_matrix)
# View the structure of ex_matrix
str(ex_matrix)
## An 'xts' object on 2016-06-01/2016-06-03 containing:
## Data: num [1:3, 1:2] 1 1 1 2 2 2
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Extract the 3rd observation of the 2nd column of ex_matrix
ex_matrix[3, 2]
## [,1]
## 2016-06-03 2
# Extract the 3rd observation of the 2nd column of core
core[3, 2]
## [1] 2
# Create the object data using 5 random numbers
data <- rnorm(5)
# Create dates as a Date class object starting from 2016-01-01
dates <- seq(as.Date("2016-01-01"), length = 5, by = "days")
# Use xts() to create smith
smith <- xts(x = data, order.by = dates)
# Create bday (1899-05-08) using a POSIXct date class object
bday <- as.POSIXct("1899-05-08")
# Create hayek and add a new attribute called born
hayek <- xts(x = data, order.by = dates, born = bday)
# Extract the core data of hayek
hayek_core <- coredata(hayek)
# View the class of hayek_core
class(hayek_core)
## [1] "matrix"
# Extract the index of hayek
hayek_index <- index(hayek)
# View the class of hayek_index
class(hayek_index)
## [1] "Date"
# Create dates
dates <- as.Date("2016-01-01") + 0:4
# Create ts_a
ts_a <- xts(x = 1:5, order.by = dates)
# Create ts_b
ts_b <- xts(x = 1:5, order.by = as.POSIXct(dates))
# Extract the rows of ts_a using the index of ts_b
ts_a[index(ts_b)]
## [,1]
## 2016-01-01 1
## 2016-01-02 2
## 2016-01-03 3
## 2016-01-04 4
## 2016-01-05 5
# Extract the rows of ts_b using the index of ts_a
ts_b[index(ts_a)]
## [,1]
data(austres)
# Convert austres to an xts object called au
au <- as.xts(austres)
# Convert your xts object (au) into a matrix am
am <- as.matrix(au)
# Convert the original austres into a matrix am2
am2 <- as.matrix(austres)
# Create dat by reading tmp_file
tmp_file <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1127/datasets/tmp_file.csv"
dat <- read.csv(tmp_file)
# Convert dat into xts
xts(dat, order.by = as.Date(rownames(dat), "%m/%d/%Y"))
## a b
## 2015-01-02 1 3
## 2015-02-03 2 4
# Read tmp_file using read.zoo
dat_zoo <- read.zoo(tmp_file, index.column = 0, sep = ",", format = "%m/%d/%Y")
# Convert dat_zoo to xts
dat_xts <- as.xts(dat_zoo)
# Convert sunspots to xts using as.xts(). Save this as sunspots_xts
sunspots_xts <- as.xts(sunspots)
# Get the temporary file name
tmp <- tempfile()
# Write the xts object using zoo to tmp
write.zoo(sunspots_xts, sep = ",", file = tmp)
# Read the tmp file. FUN = as.yearmon converts strings such as Jan 1749 into a proper time class
sun <- read.zoo(tmp, sep = ",", FUN = as.yearmon)
# Convert sun into xts. Save this as sun_xts
sun_xts <- as.xts(sun)
data(edhec, package="PerformanceAnalytics")
head(edhec["2007-01", 1])
## Convertible Arbitrage
## 2007-01-31 0.013
head(edhec["2007-01/2007-03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
head(edhec["200701/03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
first(edhec[, "Funds of Funds"], "4 months")
## Funds of Funds
## 1997-01-31 0.0317
## 1997-02-28 0.0106
## 1997-03-31 -0.0077
## 1997-04-30 0.0009
last(edhec[, "Funds of Funds"], "1 year")
## Funds of Funds
## 2009-01-31 0.0060
## 2009-02-28 -0.0037
## 2009-03-31 0.0008
## 2009-04-30 0.0092
## 2009-05-31 0.0312
## 2009-06-30 0.0024
## 2009-07-31 0.0153
## 2009-08-31 0.0113
Chapter 1 - Data cleaning and summarization - ggplot2, dplyr, real-world dataset
United Nations dataset - voting history, from a scenario where every nation gets a vote:
Grouping and Summarizing - make the dataset manageable:
Sorting and filtering summarized data:
Example code includes:
# Grab only the sessions that are even numbered, then double-check that the list is unique by rcid
evenSessions <- unvotes::un_roll_calls %>%
filter(session %% 2 == 0)
nrow(evenSessions) == nrow(evenSessions %>%
select(rcid) %>%
distinct()
)
## [1] TRUE
# Double check that un_votes is unique on rcid-country, then inner_join the evenSessions file
nrow(unvotes::un_votes) == nrow(unvotes::un_votes %>%
select(rcid, country) %>%
distinct()
)
## [1] TRUE
baseData <- unvotes::un_votes %>%
inner_join(evenSessions, by="rcid") %>%
select(rcid, session, vote, country)
str(baseData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 353720 obs. of 4 variables:
## $ rcid : atomic 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "comment")= chr "rcid"
## $ session: num 2 2 2 2 2 2 2 2 2 2 ...
## $ vote : Factor w/ 3 levels "abstain","no",..: 3 3 3 2 3 3 1 3 2 3 ...
## $ country: chr "Paraguay" "Honduras" "Luxembourg" "Poland" ...
# Create the 1-2-3 system where 1=yes, 2=abstain, and 3=no
chrVotes <- as.character(baseData$vote)
fctVotes <- factor(chrVotes, levels=c("yes", "abstain", "no"))
intVotes <- as.integer(fctVotes)
table(chrVotes, intVotes) # confirm that 1=yes, 2=abstain, 3=no
## intVotes
## chrVotes 1 2 3
## abstain 0 45444 0
## no 0 0 25344
## yes 282932 0 0
baseData <- baseData %>%
mutate(oldFctVote = vote, vote=intVotes)
str(baseData) # 353,720 x 4
## Classes 'tbl_df', 'tbl' and 'data.frame': 353720 obs. of 5 variables:
## $ rcid : atomic 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "comment")= chr "rcid"
## $ session : num 2 2 2 2 2 2 2 2 2 2 ...
## $ vote : int 1 1 1 3 1 1 2 1 3 1 ...
## $ country : chr "Paraguay" "Honduras" "Luxembourg" "Poland" ...
## $ oldFctVote: Factor w/ 3 levels "abstain","no",..: 3 3 3 2 3 3 1 3 2 3 ...
# Create the full table of all combinations of rcid-session x country
# (so that votes can be entered there as either 9-not member or 8-not present)
uqVotes <- distinct(baseData[,c("rcid", "session")]) # 2,590 x 2
uqCountry <- distinct(baseData[,c("country"),drop=FALSE]) # 200x1
uqVotes$dummy <- 1L
uqCountry$dummy <- 1L
uqVoteCountry <- full_join(uqVotes, uqCountry, by="dummy") # 518,000 x 4 (rcid-session-dummy-country)
missVoteCountry <- uqVoteCountry %>%
select(-dummy) %>%
setdiff(select(baseData, -vote, -oldFctVote)) # 164,280 x 3 (rcid-session-country)
# Create the unique list of session-country
# (countries that voted at least once in a session will be assumed
# to have been not members at any votes missed in that session)
uqSessionCountry <- baseData %>%
select(session, country) %>%
distinct() # 4,744 x 2
nmVoteCountry <- missVoteCountry %>%
anti_join(uqSessionCountry, by=c("session", "country")) # 132,147 x 3 (rcid-session-country)
npVoteCountry <- missVoteCountry %>%
semi_join(uqSessionCountry, by=c("session", "country")) # 32,133 x 3 (rcid-session-country)
# Bind the rows together, noting their sources for the record
unvotes <- bind_rows(baseData,
mutate(nmVoteCountry, vote=9, oldFctVote=NA),
mutate(npVoteCountry, vote=8, oldFctVote=NA),
.id="source"
) # 518,000 x 6 (source-rcid-session-vote-country-oldFctVote)
# Put the UN code on them (the unvotes data uses the Correlates of War number, variable "cown")
missCountry <- uqCountry %>%
select(-dummy) %>%
anti_join(countrycode::countrycode_data, by=c("country" = "country.name.en"))
reMap <- c(
"Bolivia, Plurinational State of"="Bolivia (Plurinational State of)",
"Congo, the Democratic Republic of the"="Democratic Republic of the Congo",
"Cote d'Ivoire"="Côte D'Ivoire",
"Gambia"="Gambia (Islamic Republic of the)",
"Guinea-Bissau"="Guinea Bissau",
"Iran, Islamic Republic of"="Iran (Islamic Republic of)",
"Korea, Democratic People's Republic of"="Democratic People's Republic of Korea",
"Korea, Republic of"="Republic of Korea",
"Macedonia, the former Yugoslav Republic of"="The former Yugoslav Republic of Macedonia",
"Micronesia, Federated States of"="Micronesia (Federated States of)",
"Moldova, Republic of"="Republic of Moldova",
"Tanzania, United Republic of"="United Republic of Tanzania",
"United Kingdom"="United Kingdom of Great Britain and Northern Ireland",
"United States"="United States of America"
)
mapMissCountry <- missCountry %>%
mutate(newCountry=reMap[country]) %>%
left_join(select(countrycode::countrycode_data, country.name.en, iso3n, un, cown),
by=c("newCountry" = "country.name.en")
)
mapOKCountry <- uqCountry %>%
select(-dummy) %>%
inner_join(select(countrycode::countrycode_data, country.name.en, iso3n, un, cown),
by=c("country" = "country.name.en")
)
mapCountry <- mapMissCountry %>%
select(country, iso3n, un, cown) %>%
bind_rows(mapOKCountry) # 200 x 4 (country-iso3n-un-cown)
mapCountry[duplicated(mapCountry$cown), ] # no countries
## # A tibble: 0 × 4
## # ... with 4 variables: country <chr>, iso3n <int>, un <int>, cown <int>
# Place the cown code on the unvotes dataset as ccode, and delete the records where it is NA
unvotes %>%
anti_join(mapCountry, by=c("country")) # None, as it should be
## # A tibble: 0 × 6
## # ... with 6 variables: source <chr>, rcid <dbl>, session <dbl>,
## # vote <dbl>, country <chr>, oldFctVote <fctr>
unvotes_tmp <- unvotes %>%
left_join(mapCountry, by=c("country"))
votes <- unvotes_tmp %>%
filter(!is.na(cown)) %>%
mutate(ccode=cown) %>%
select(rcid, session, vote, ccode) # 518,000 (200 iso3n x 2,590 votes) x 4 (rcid-session-vote-ccode)
# Now can actually run the process on the newly created "votes" dataset
# Print the votes dataset
votes
## # A tibble: 515,410 × 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 150
## 2 46 2 1 91
## 3 46 2 1 212
## 4 46 2 3 290
## 5 46 2 1 900
## 6 46 2 1 140
## 7 46 2 2 530
## 8 46 2 1 840
## 9 46 2 3 365
## 10 46 2 1 160
## # ... with 515,400 more rows
# Filter for only votes that are "yes", "abstain", or "no"
votes %>% filter(vote <= 3)
## # A tibble: 351,529 × 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 150
## 2 46 2 1 91
## 3 46 2 1 212
## 4 46 2 3 290
## 5 46 2 1 900
## 6 46 2 1 140
## 7 46 2 2 530
## 8 46 2 1 840
## 9 46 2 3 365
## 10 46 2 1 160
## # ... with 351,519 more rows
# Add another %>% step to add a year column
votes %>%
filter(vote <= 3) %>%
mutate(year=1945+session)
## # A tibble: 351,529 × 5
## rcid session vote ccode year
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 46 2 1 150 1947
## 2 46 2 1 91 1947
## 3 46 2 1 212 1947
## 4 46 2 3 290 1947
## 5 46 2 1 900 1947
## 6 46 2 1 140 1947
## 7 46 2 2 530 1947
## 8 46 2 1 840 1947
## 9 46 2 3 365 1947
## 10 46 2 1 160 1947
## # ... with 351,519 more rows
# Convert country code 100
countrycode::countrycode(100, "cown", "country.name")
## [1] "Colombia"
# Add a country column within the mutate: votes_processed
votes_processed <- votes %>%
filter(vote <= 3) %>%
mutate(year = session + 1945,
country = countrycode::countrycode(ccode, "cown", "country.name")
)
# Print votes_processed
votes_processed
## # A tibble: 351,529 × 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 150 1947 Paraguay
## 2 46 2 1 91 1947 Honduras
## 3 46 2 1 212 1947 Luxembourg
## 4 46 2 3 290 1947 Poland
## 5 46 2 1 900 1947 Australia
## 6 46 2 1 140 1947 Brazil
## 7 46 2 2 530 1947 Ethiopia
## 8 46 2 1 840 1947 Philippines
## 9 46 2 3 365 1947 Russian Federation
## 10 46 2 1 160 1947 Argentina
## # ... with 351,519 more rows
# Find total and fraction of "yes" votes
votes_processed %>%
summarize(total=n(), percent_yes=mean(vote==1))
## # A tibble: 1 × 2
## total percent_yes
## <int> <dbl>
## 1 351529 0.7997719
# Change this code to summarize by year
votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
## # A tibble: 34 × 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 2039 0.5693968
## 2 1949 3469 0.4375901
## 3 1951 1434 0.5850767
## 4 1953 1537 0.6317502
## 5 1955 2169 0.6947902
## 6 1957 2708 0.6085672
## 7 1959 4326 0.5880721
## 8 1961 7417 0.5726035
## 9 1963 3277 0.7296308
## 10 1965 4341 0.7065192
## # ... with 24 more rows
# Summarize by country: by_country
by_country <- votes_processed %>%
group_by(country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Print the by_country dataset
by_country
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Afghanistan 2373 0.8592499
## 2 Albania 1696 0.7169811
## 3 Algeria 2214 0.8992773
## 4 Andorra 720 0.6375000
## 5 Angola 1432 0.9238827
## 6 Antigua and Barbuda 1303 0.9125096
## 7 Argentina 2554 0.7678152
## 8 Armenia 758 0.7467018
## 9 Australia 2576 0.5562888
## 10 Austria 2390 0.6221757
## # ... with 189 more rows
# Sort in ascending order of percent_yes
by_country %>%
arrange(percent_yes)
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Zanzibar 2 0.0000000
## 2 United States of America 2569 0.2693655
## 3 Palau 370 0.3378378
## 4 Israel 2381 0.3406132
## 5 Federal Republic of Germany 1075 0.3972093
## 6 United Kingdom of Great Britain and Northern Ireland 2559 0.4165690
## 7 France 2528 0.4264241
## 8 Micronesia (Federated States of) 724 0.4419890
## 9 Marshall Islands 757 0.4914135
## 10 Belgium 2569 0.4920202
## # ... with 189 more rows
# Now sort in descending order
by_country %>%
arrange(-percent_yes)
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Sao Tome and Principe 1091 0.9761687
## 2 Seychelles 882 0.9750567
## 3 Djibouti 1599 0.9612258
## 4 Guinea Bissau 1539 0.9603639
## 5 Timor-Leste 327 0.9571865
## 6 Mauritius 1832 0.9497817
## 7 Zimbabwe 1362 0.9493392
## 8 Comoros 1134 0.9470899
## 9 United Arab Emirates 1935 0.9467700
## 10 Mozambique 1702 0.9465335
## # ... with 189 more rows
# Filter out countries with fewer than 100 votes
by_country %>%
arrange(percent_yes) %>%
filter(total >= 100)
## # A tibble: 196 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 United States of America 2569 0.2693655
## 2 Palau 370 0.3378378
## 3 Israel 2381 0.3406132
## 4 Federal Republic of Germany 1075 0.3972093
## 5 United Kingdom of Great Britain and Northern Ireland 2559 0.4165690
## 6 France 2528 0.4264241
## 7 Micronesia (Federated States of) 724 0.4419890
## 8 Marshall Islands 757 0.4914135
## 9 Belgium 2569 0.4920202
## 10 Canada 2577 0.5079550
## # ... with 186 more rows
Chapter 2 - Visualization with ggplot2
General ggplot2 background - better exploration of the trends over time:
Visualizing by country - see for an individual country or groups of countries:
Faceting to show multiple plots:
Example code includes:
# Define by_year: one row per year with vote totals
# (percent_yes = share of votes coded 1, which appears to encode a "yes" vote)
by_year <- votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Load the ggplot2 package
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Create line plot
ggplot(by_year, aes(x=year, y=percent_yes)) +
geom_line()
# Scatter plot of the same data with a smoothing curve overlaid
ggplot(by_year, aes(year, percent_yes)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# Group by year and country: by_year_country
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Print by_year_country (still grouped by year after summarize peels off country)
by_year_country
## Source: local data frame [4,717 x 4]
## Groups: year [?]
##
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 Afghanistan 34 0.3823529
## 2 1947 Argentina 38 0.5789474
## 3 1947 Australia 38 0.5526316
## 4 1947 Belarus 38 0.5000000
## 5 1947 Belgium 38 0.6052632
## 6 1947 Bolivia (Plurinational State of) 37 0.5945946
## 7 1947 Brazil 38 0.6578947
## 8 1947 Canada 38 0.6052632
## 9 1947 Chile 38 0.6578947
## 10 1947 Colombia 35 0.5428571
## # ... with 4,707 more rows
# Create a filtered version: UK_by_year
UK_by_year <- by_year_country %>%
filter(country == "United Kingdom of Great Britain and Northern Ireland")
# Line plot of percent_yes over time for UK only
ggplot(UK_by_year, aes(x=year, y=percent_yes)) + geom_line()
# Vector of four countries to examine
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"India"
)
# Filter by_year_country: filtered_4_countries
filtered_4_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes in four countries (one colored line per country)
ggplot(filtered_4_countries, aes(x=year, y=percent_yes, color=country)) +
geom_line()
# Expand to six countries (reuses/overwrites the `countries` vector)
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"Japan",
"Brazil",
"India"
)
# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap(~ country)
# Same plot, but each facet gets its own y-axis range
ggplot(filtered_6_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scale="free_y")
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"Japan",
"Brazil",
"India",
"Canada",
"Mexico",
"Israel"
)
# Filtered by_year_country: filtered_countries
filtered_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scales = "free_y")
Chapter 3 - Tidy modeling with broom
Linear regression - quantifying trends (best-fit-lines):
Tidying models with broom:
Nesting for multiple models:
Fitting multiple models to the nested data:
Working with many tidy models:
Example code includes:
# Percentage of yes votes from the US by year: US_by_year
US_by_year <- by_year_country %>%
filter(country == "United States of America")
# Print the US_by_year data
US_by_year
## Source: local data frame [34 x 4]
## Groups: year [34]
##
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 United States of America 38 0.7105263
## 2 1949 United States of America 64 0.2812500
## 3 1951 United States of America 25 0.4000000
## 4 1953 United States of America 26 0.5000000
## 5 1955 United States of America 37 0.6216216
## 6 1957 United States of America 34 0.6470588
## 7 1959 United States of America 54 0.4259259
## 8 1961 United States of America 75 0.5066667
## 9 1963 United States of America 32 0.5000000
## 10 1965 United States of America 41 0.3658537
## # ... with 24 more rows
# Perform a linear regression of percent_yes by year: US_fit
US_fit <- lm(percent_yes ~ year, data=US_by_year)
# Perform summary() on the US_fit object
summary(US_fit)
##
## Call:
## lm(formula = percent_yes ~ year, data = US_by_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.222557 -0.080540 -0.008592 0.081983 0.194232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6724804 1.8378722 6.895 8.36e-08 ***
## year -0.0062435 0.0009282 -6.727 1.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1062 on 32 degrees of freedom
## Multiple R-squared: 0.5858, Adjusted R-squared: 0.5728
## F-statistic: 45.25 on 1 and 32 DF, p-value: 1.348e-07
# Call the tidy() function on the US_fit object
# (broom::tidy turns the model summary into a tidy data frame, one row per term)
broom::tidy(US_fit)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.672480427 1.8378722388 6.895191 8.360047e-08
## 2 year -0.006243547 0.0009281727 -6.726708 1.347828e-07
# Linear regression of percent_yes by year for US
US_by_year <- by_year_country %>%
filter(country == "United States of America")
US_fit <- lm(percent_yes ~ year, US_by_year)
# Fit model for the United Kingdom
UK_by_year <- by_year_country %>%
filter(country == "United Kingdom of Great Britain and Northern Ireland")
UK_fit <- lm(percent_yes ~ year, UK_by_year)
# Create US_tidied and UK_tidied
US_tidied <- broom::tidy(US_fit)
UK_tidied <- broom::tidy(UK_fit)
# Combine the two tidied models (stacks the rows; no country identifier is kept)
bind_rows(US_tidied, UK_tidied)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.672480427 1.8378722388 6.895191 8.360047e-08
## 2 year -0.006243547 0.0009281727 -6.726708 1.347828e-07
## 3 (Intercept) -3.237477542 1.9542206633 -1.656659 1.073629e-01
## 4 year 0.001854637 0.0009869317 1.879195 6.935175e-02
# Seems like a HORRIBLE function; messed up all the data unless it was 1) ungrouped, and 2) arranged by the planned nesting variables
# Nest all columns besides country
# (nest(-country) collapses year/total/percent_yes into a per-country list-column named "data")
by_year_country %>%
ungroup() %>%
arrange(country) %>%
tidyr::nest(-country)
## # A tibble: 199 × 2
## country data
## <chr> <list>
## 1 Afghanistan <tibble [34 × 3]>
## 2 Albania <tibble [29 × 3]>
## 3 Algeria <tibble [26 × 3]>
## 4 Andorra <tibble [11 × 3]>
## 5 Angola <tibble [19 × 3]>
## 6 Antigua and Barbuda <tibble [17 × 3]>
## 7 Argentina <tibble [34 × 3]>
## 8 Armenia <tibble [12 × 3]>
## 9 Australia <tibble [34 × 3]>
## 10 Austria <tibble [29 × 3]>
## # ... with 189 more rows
nested <- by_year_country %>%
ungroup() %>%
arrange(country) %>%
tidyr::nest(-country)
# Print the nested data for Brazil
# (single-bracket indexing on the list-column returns a one-element list)
nested$data[nested$country == "Brazil"]
## [[1]]
## # A tibble: 34 × 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 38 0.6578947
## 2 1949 64 0.4687500
## 3 1951 25 0.6400000
## 4 1953 26 0.7307692
## 5 1955 37 0.7297297
## 6 1957 34 0.7352941
## 7 1959 54 0.5370370
## 8 1961 76 0.5526316
## 9 1963 32 0.7812500
## 10 1965 41 0.6097561
## # ... with 24 more rows
# Unnest the data column to return it to its original form
tidyr::unnest(nested, data)
## # A tibble: 4,717 × 4
## country year total percent_yes
## <chr> <dbl> <int> <dbl>
## 1 Afghanistan 1947 34 0.3823529
## 2 Afghanistan 1949 51 0.6078431
## 3 Afghanistan 1951 25 0.7600000
## 4 Afghanistan 1953 26 0.7692308
## 5 Afghanistan 1955 37 0.7297297
## 6 Afghanistan 1957 34 0.5294118
## 7 Afghanistan 1959 54 0.6111111
## 8 Afghanistan 1961 76 0.6052632
## 9 Afghanistan 1963 32 0.7812500
## 10 Afghanistan 1965 40 0.8500000
## # ... with 4,707 more rows
# Perform a linear regression on each item in the data column
# (one lm fit per country; mdls is a list of lm objects parallel to nested$data)
mdls <- purrr::map(nested$data, ~ lm(percent_yes ~ year, .))
nested %>%
mutate(model = mdls)
## # A tibble: 199 × 3
## country data model
## <chr> <list> <list>
## 1 Afghanistan <tibble [34 × 3]> <S3: lm>
## 2 Albania <tibble [29 × 3]> <S3: lm>
## 3 Algeria <tibble [26 × 3]> <S3: lm>
## 4 Andorra <tibble [11 × 3]> <S3: lm>
## 5 Angola <tibble [19 × 3]> <S3: lm>
## 6 Antigua and Barbuda <tibble [17 × 3]> <S3: lm>
## 7 Argentina <tibble [34 × 3]> <S3: lm>
## 8 Armenia <tibble [12 × 3]> <S3: lm>
## 9 Australia <tibble [34 × 3]> <S3: lm>
## 10 Austria <tibble [29 × 3]> <S3: lm>
## # ... with 189 more rows
# This one errors out for some reason (only in knitr, not in the console)
# nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, .)))
# Add another mutate that applies tidy() to each model
tidyModel <- purrr::map(mdls, ~broom::tidy(.))
nested %>%
mutate(model = mdls) %>%
mutate(tidied = tidyModel)
## # A tibble: 199 × 4
## country data model tidied
## <chr> <list> <list> <list>
## 1 Afghanistan <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 2 Albania <tibble [29 × 3]> <S3: lm> <data.frame [2 × 5]>
## 3 Algeria <tibble [26 × 3]> <S3: lm> <data.frame [2 × 5]>
## 4 Andorra <tibble [11 × 3]> <S3: lm> <data.frame [2 × 5]>
## 5 Angola <tibble [19 × 3]> <S3: lm> <data.frame [2 × 5]>
## 6 Antigua and Barbuda <tibble [17 × 3]> <S3: lm> <data.frame [2 × 5]>
## 7 Argentina <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 8 Armenia <tibble [12 × 3]> <S3: lm> <data.frame [2 × 5]>
## 9 Australia <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 10 Austria <tibble [29 × 3]> <S3: lm> <data.frame [2 × 5]>
## # ... with 189 more rows
# This one errors out for some reason (only in knitr, not in the console)
# nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .))) %>%
# mutate(tidied = purrr::map(model, ~ broom::tidy(.)))
# Add one more step that unnests the tidied column
# (yields one row per country-term: intercept and year slope for each country)
country_coefficients <- nested %>%
mutate(model = mdls,
tidied = tidyModel
) %>%
tidyr::unnest(tidied)
# Same erroring-out issue in knitr as above . . .
# country_coefficients <- nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .)),
# tidied = purrr::map(model, broom::tidy)
# ) %>%
# tidyr::unnest(tidied)
# Print the resulting country_coefficients variable
country_coefficients
## # A tibble: 397 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.52325215
## 2 Afghanistan year 0.006009299 0.0007426499 8.09169837
## 3 Albania (Intercept) 3.360559305 3.3085971803 1.01570518
## 4 Albania year -0.001345460 0.0016667404 -0.80724056
## 5 Algeria (Intercept) -5.461121731 1.7452792997 -3.12908182
## 6 Algeria year 0.003193022 0.0008778821 3.63718725
## 7 Andorra (Intercept) -0.358359014 4.8835752846 -0.07338046
## 8 Andorra year 0.000493452 0.0024381183 0.20239049
## 9 Angola (Intercept) 3.093752452 2.0124923762 1.53727412
## 10 Angola year -0.001090811 0.0010087529 -1.08134636
## # ... with 387 more rows, and 1 more variables: p.value <dbl>
# Print the country_coefficients dataset
country_coefficients
## # A tibble: 397 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.52325215
## 2 Afghanistan year 0.006009299 0.0007426499 8.09169837
## 3 Albania (Intercept) 3.360559305 3.3085971803 1.01570518
## 4 Albania year -0.001345460 0.0016667404 -0.80724056
## 5 Algeria (Intercept) -5.461121731 1.7452792997 -3.12908182
## 6 Algeria year 0.003193022 0.0008778821 3.63718725
## 7 Andorra (Intercept) -0.358359014 4.8835752846 -0.07338046
## 8 Andorra year 0.000493452 0.0024381183 0.20239049
## 9 Angola (Intercept) 3.093752452 2.0124923762 1.53727412
## 10 Angola year -0.001090811 0.0010087529 -1.08134636
## # ... with 387 more rows, and 1 more variables: p.value <dbl>
# Filter for only the slope terms (drop the intercepts)
country_coefficients %>%
filter(term == "year")
## # A tibble: 198 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499 8.0916984
## 2 Albania year -0.001345460 0.0016667404 -0.8072406
## 3 Algeria year 0.003193022 0.0008778821 3.6371873
## 4 Andorra year 0.000493452 0.0024381183 0.2023905
## 5 Angola year -0.001090811 0.0010087529 -1.0813464
## 6 Antigua and Barbuda year 0.001079916 0.0010590399 1.0197121
## 7 Argentina year 0.005152270 0.0010610352 4.8558902
## 8 Armenia year -0.003570723 0.0035892632 -0.9948346
## 9 Australia year 0.002553740 0.0010859947 2.3515218
## 10 Austria year 0.002840993 0.0008664018 3.2790715
## # ... with 188 more rows, and 1 more variables: p.value <dbl>
# Filter for only the slope terms
slope_terms <- country_coefficients %>%
filter(term == "year")
# Add p.adjusted column, then filter
# (p.adjust() with its default method — Holm — corrects for multiple comparisons)
slope_terms %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
## # A tibble: 62 × 7
## country term estimate std.error
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499
## 2 Argentina year 0.005152270 0.0010610352
## 3 Barbados year 0.005616368 0.0013347331
## 4 Belarus year 0.003912506 0.0007585622
## 5 Belgium year 0.003186372 0.0007630472
## 6 Bolivia (Plurinational State of) year 0.005803654 0.0009657579
## 7 Brazil year 0.006108871 0.0008167495
## 8 Cambodia year 0.006792013 0.0011544253
## 9 Central African Republic year 0.005567740 0.0013039928
## 10 Chile year 0.006776937 0.0008220202
## # ... with 52 more rows, and 3 more variables: statistic <dbl>,
## # p.value <dbl>, p.adjusted <dbl>
# Filter by adjusted p-values
filtered_countries <- country_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < .05)
# Sort for the countries increasing most quickly
filtered_countries %>%
arrange(desc(estimate))
## # A tibble: 62 × 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 South Africa year 0.011861365 0.0014004289 8.469809
## 2 Kazakhstan year 0.010955741 0.0019482401 5.623404
## 3 Yemen Arab Republic year 0.010854882 0.0015869058 6.840281
## 4 Kyrgyzstan year 0.009725462 0.0009884060 9.839541
## 5 Malawi year 0.009087765 0.0018112478 5.017406
## 6 Dominican Republic year 0.008055482 0.0009138578 8.814809
## 7 Portugal year 0.007996968 0.0017114569 4.672609
## 8 Honduras year 0.007721191 0.0009211022 8.382556
## 9 Peru year 0.007301189 0.0009763560 7.477999
## 10 Nicaragua year 0.007077883 0.0010715994 6.604971
## # ... with 52 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
# Sort for the countries decreasing most quickly
filtered_countries %>%
arrange(estimate)
## # A tibble: 62 × 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Republic of Korea year -0.009256217 0.0015111366 -6.125334
## 2 Israel year -0.006859149 0.0011717864 -5.853583
## 3 United States of America year -0.006243547 0.0009281727 -6.726708
## 4 Belgium year 0.003186372 0.0007630472 4.175852
## 5 Guinea year 0.003623915 0.0008324455 4.353336
## 6 Morocco year 0.003800921 0.0008601889 4.418705
## 7 Belarus year 0.003912506 0.0007585622 5.157792
## 8 Iran (Islamic Republic of) year 0.003914836 0.0008554901 4.576133
## 9 Congo year 0.003967778 0.0009220262 4.303324
## 10 Sudan year 0.003991321 0.0009613509 4.151784
## # ... with 52 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
Chapter 4 - Joining and Tidying
Joining datasets - bringing in the descriptions for each type of roll call vote:
Tidy data - creating graphs faceted by issue, and with lines for a few key countries:
Tidy modeling by topic and country - running linear models by country and topic:
Example code includes:
# The dataset unvotes::un_roll_call_issues is 4,951 x 3 [rcid-short_name-issue]
str(unvotes::un_roll_call_issues) # 4,951x3
## Classes 'tbl_df', 'tbl' and 'data.frame': 4951 obs. of 3 variables:
## $ rcid : num 30 34 77 9002 9003 ...
## $ short_name: chr "me" "me" "me" "me" ...
## $ issue : chr "Palestinian conflict" "Palestinian conflict" "Palestinian conflict" "Palestinian conflict" ...
table(unvotes::un_roll_call_issues$short_name) # Has the 6 key issues we are seeking
##
## co di ec hr me nu
## 971 859 461 901 1047 712
sum(table(unvotes::un_roll_call_issues$short_name)) # 4,951
## [1] 4951
nrow(distinct(select(unvotes::un_roll_call_issues, rcid))) # 3,813 (there are duplicates by rcid)
## [1] 3813
# Spread the issue codes wide: one row per rcid, one 0/1 indicator column per issue
tmpData <- unvotes::un_roll_call_issues %>%
mutate(dummy=1) %>%
select(rcid, short_name, dummy) %>%
tidyr::spread(key=short_name, value=dummy, fill=0)
str(tmpData) # 3,813 x 7 (rcid-6 issues)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3813 obs. of 7 variables:
## $ rcid: num 6 8 11 18 19 24 26 27 28 29 ...
## $ co : num 0 0 1 0 0 0 1 1 1 1 ...
## $ di : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ec : num 0 1 0 1 1 1 0 0 0 0 ...
## $ hr : num 1 0 0 0 0 0 0 0 0 0 ...
## $ me : num 0 0 0 0 0 0 0 0 0 0 ...
## $ nu : num 0 0 0 0 0 0 0 0 0 0 ...
# How many issues does each roll call carry?
tmpData %>%
select(-rcid) %>%
rowSums() %>%
table() # 2,836 are 1 ; 816 are 2 ; 161 are 3
## .
## 1 2 3
## 2836 816 161
# The dataset unvotes::un_roll_calls is 5,356 x 9 [rcid-session-importantvote-date-unres-amend-para-short-descr]
str(unvotes::un_roll_calls) # 5,356 x 9
## Classes 'tbl_df', 'tbl' and 'data.frame': 5356 obs. of 9 variables:
## $ rcid : num 3 4 5 6 7 8 9 10 11 12 ...
## $ session : num 1 1 1 1 1 1 1 1 1 1 ...
## $ importantvote: num 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date, format: "1946-01-01" "1946-01-02" ...
## $ unres : chr "R/1/66" "R/1/79" "R/1/98" "R/1/107" ...
## $ amend : num 1 0 0 0 1 1 0 1 0 1 ...
## $ para : num 0 0 0 0 0 0 0 1 0 1 ...
## $ short : chr "AMENDMENTS, RULES OF PROCEDURE" "SECURITY COUNCIL ELECTIONS" "VOTING PROCEDURE" "DECLARATION OF HUMAN RIGHTS" ...
## $ descr : chr "TO ADOPT A CUBAN AMENDMENT TO THE UK PROPOSAL REFERRING THE PROVISIONAL RULES OF PROCEDURE AND ANY AMENDMENTS THEREOF TO THE 6T"| __truncated__ "TO ADOPT A USSR PROPOSAL ADJOURNING DEBATE ON AND POSTPONINGELECTIONS OF THE NON-PERMANENT MEMBERS OF THE SECURITY COUNCIL, TO "| __truncated__ "TO ADOPT THE KOREAN PROPOSAL THAT INVALID BALLOTS BE INCLUDED IN THE TOTAL NUMBER OF \\MEMBERS PRESENT AND VOTING\\\\, IN CALCU"| __truncated__ "TO ADOPT A CUBAN PROPOSAL (A/3-C) THAT AN ITEM ON A DECLARATION OF THE RIGHTS AND DUTIES OF MAN BE TABLED." ...
nrow(distinct(select(unvotes::un_roll_calls, rcid))) == nrow(unvotes::un_roll_calls) # TRUE (no duplicates)
## [1] TRUE
# Combine the datasets to create "descriptions" which should have 10 columns (rcid-session-date-unres-6 numerics)
# The dataset "descriptions" should have only the even numbered sessions
descriptions <- unvotes::un_roll_calls %>%
select(rcid, session, date, unres) %>%
left_join(tmpData, by="rcid") %>%
filter(session %% 2 == 0)
# Roll calls with no issue tag got NA from the left join; treat those as 0
numVars <- c("me", "nu", "di", "hr", "co", "ec")
descriptions[, numVars][is.na(descriptions[, numVars])] <- 0
# Print the votes_processed dataset
votes_processed
## # A tibble: 351,529 × 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 150 1947 Paraguay
## 2 46 2 1 91 1947 Honduras
## 3 46 2 1 212 1947 Luxembourg
## 4 46 2 3 290 1947 Poland
## 5 46 2 1 900 1947 Australia
## 6 46 2 1 140 1947 Brazil
## 7 46 2 2 530 1947 Ethiopia
## 8 46 2 1 840 1947 Philippines
## 9 46 2 3 365 1947 Russian Federation
## 10 46 2 1 160 1947 Argentina
## # ... with 351,519 more rows
# Print the descriptions dataset
descriptions
## # A tibble: 2,590 × 10
## rcid session date unres co di ec hr me nu
## <dbl> <dbl> <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 46 2 1947-09-04 R/2/299 0 0 0 0 0 0
## 2 47 2 1947-10-05 R/2/355 0 0 0 1 0 0
## 3 48 2 1947-10-06 R/2/461 0 0 0 0 0 0
## 4 49 2 1947-10-06 R/2/463 0 0 0 0 0 0
## 5 50 2 1947-10-06 R/2/465 0 0 0 0 0 0
## 6 51 2 1947-10-02 R/2/561 1 0 0 0 0 0
## 7 52 2 1947-11-06 R/2/650 1 0 0 0 0 0
## 8 53 2 1947-11-06 R/2/651 1 0 0 0 0 0
## 9 54 2 1947-11-06 R/2/651 1 0 0 0 0 0
## 10 55 2 1947-11-06 R/2/667 1 0 0 0 0 0
## # ... with 2,580 more rows
# Join them together based on the "rcid" and "session" columns
votes_joined <- inner_join(votes_processed, descriptions, by=c("rcid", "session"))
votes_joined # 351,529 x 14 (all of votes_processed matched; every vote keeps its issue flags)
## # A tibble: 351,529 × 14
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date> <chr>
## 1 46 2 1 150 1947 Paraguay 1947-09-04 R/2/299
## 2 46 2 1 91 1947 Honduras 1947-09-04 R/2/299
## 3 46 2 1 212 1947 Luxembourg 1947-09-04 R/2/299
## 4 46 2 3 290 1947 Poland 1947-09-04 R/2/299
## 5 46 2 1 900 1947 Australia 1947-09-04 R/2/299
## 6 46 2 1 140 1947 Brazil 1947-09-04 R/2/299
## 7 46 2 2 530 1947 Ethiopia 1947-09-04 R/2/299
## 8 46 2 1 840 1947 Philippines 1947-09-04 R/2/299
## 9 46 2 3 365 1947 Russian Federation 1947-09-04 R/2/299
## 10 46 2 1 160 1947 Argentina 1947-09-04 R/2/299
## # ... with 351,519 more rows, and 6 more variables: co <dbl>, di <dbl>,
## # ec <dbl>, hr <dbl>, me <dbl>, nu <dbl>
# Filter for votes related to colonialism
votes_joined %>%
filter(co == 1)
## # A tibble: 60,589 × 14
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 60,579 more rows, and 7 more variables: unres <chr>, co <dbl>,
## # di <dbl>, ec <dbl>, hr <dbl>, me <dbl>, nu <dbl>
# Filter, then summarize by year: US_co_by_year
US_co_by_year <- votes_joined %>%
filter(country=="United States of America", co==1) %>%
group_by(year) %>%
summarize(percent_yes = mean(vote == 1))
# Graph the % of "yes" votes over time
ggplot(US_co_by_year, aes(x=year, y=percent_yes)) + geom_line()
# Gather the six me/nu/di/hr/co/ec columns into long topic/has_topic pairs
votes_joined %>%
tidyr::gather(topic, has_topic, co:nu)
## # A tibble: 2,109,174 × 10
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date> <chr>
## 1 46 2 1 150 1947 Paraguay 1947-09-04 R/2/299
## 2 46 2 1 91 1947 Honduras 1947-09-04 R/2/299
## 3 46 2 1 212 1947 Luxembourg 1947-09-04 R/2/299
## 4 46 2 3 290 1947 Poland 1947-09-04 R/2/299
## 5 46 2 1 900 1947 Australia 1947-09-04 R/2/299
## 6 46 2 1 140 1947 Brazil 1947-09-04 R/2/299
## 7 46 2 2 530 1947 Ethiopia 1947-09-04 R/2/299
## 8 46 2 1 840 1947 Philippines 1947-09-04 R/2/299
## 9 46 2 3 365 1947 Russian Federation 1947-09-04 R/2/299
## 10 46 2 1 160 1947 Argentina 1947-09-04 R/2/299
## # ... with 2,109,164 more rows, and 2 more variables: topic <chr>,
## # has_topic <dbl>
# Perform gather again, then filter to keep only vote-topic pairs that apply
votes_gathered <- votes_joined %>%
tidyr::gather(topic, has_topic, co:nu) %>%
filter(has_topic == 1)
votes_gathered # 347,890 x 10
## # A tibble: 347,890 × 10
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 347,880 more rows, and 3 more variables: unres <chr>,
## # topic <chr>, has_topic <dbl>
# Replace the two-letter codes in topic with full descriptions: votes_tidied
votes_tidied <- votes_gathered %>%
mutate(topic = recode(topic,
me = "Palestinian conflict",
nu = "Nuclear weapons and nuclear material",
di = "Arms control and disarmament",
hr = "Human rights",
co = "Colonialism",
ec = "Economic development"
)
)
# Print votes_tidied
votes_tidied
## # A tibble: 347,890 × 10
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 347,880 more rows, and 3 more variables: unres <chr>,
## # topic <chr>, has_topic <dbl>
# Summarize the percentage "yes" per country-year-topic
by_country_year_topic <- votes_tidied %>%
group_by(country, year, topic) %>%
summarize(total=n(), percent_yes=mean(vote == 1)) %>%
ungroup()
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,808 × 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,798 more rows, and 1 more variables: percent_yes <dbl>
# Filter by_country_year_topic for just the US
US_by_country_year_topic <- by_country_year_topic %>%
filter(country == "United States of America")
# Plot % yes over time for the US, faceting by topic
ggplot(US_by_country_year_topic, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap(~ topic)
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,808 × 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,798 more rows, and 1 more variables: percent_yes <dbl>
# Fit model on the by_country_year_topic dataset
# (one lm per country-topic pair; the perfect-fit warnings come from pairs with very few years)
country_topic_coefficients <- by_country_year_topic %>%
tidyr::nest(-country, -topic) %>%
mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = purrr::map(model, broom::tidy)) %>%
tidyr::unnest(tidied)
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
# Print country_topic_coefficients
country_topic_coefficients
## # A tibble: 2,371 × 7
## country topic term estimate
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan Colonialism (Intercept) -13.759624843
## 2 Afghanistan Colonialism year 0.007369733
## 3 Afghanistan Economic development (Intercept) -9.196506325
## 4 Afghanistan Economic development year 0.005106200
## 5 Afghanistan Human rights (Intercept) -11.476390441
## 6 Afghanistan Human rights year 0.006239157
## 7 Afghanistan Palestinian conflict (Intercept) -7.265379964
## 8 Afghanistan Palestinian conflict year 0.004075877
## 9 Afghanistan Arms control and disarmament (Intercept) -13.304119332
## 10 Afghanistan Arms control and disarmament year 0.007145966
## # ... with 2,361 more rows, and 3 more variables: std.error <dbl>,
## # statistic <dbl>, p.value <dbl>
# Create country_topic_filtered: significant year slopes after multiple-comparison adjustment
country_topic_filtered <- country_topic_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
# Steepest declines in % yes first
country_topic_filtered %>%
arrange(estimate)
## # A tibble: 56 × 8
## country topic term
## <chr> <chr> <chr>
## 1 Vanuatu Palestinian conflict year
## 2 Vanuatu Colonialism year
## 3 Malta Economic development year
## 4 Cyprus Human rights year
## 5 United States of America Nuclear weapons and nuclear material year
## 6 Cyprus Nuclear weapons and nuclear material year
## 7 Israel Colonialism year
## 8 Romania Human rights year
## 9 Malta Arms control and disarmament year
## 10 Cyprus Arms control and disarmament year
## # ... with 46 more rows, and 5 more variables: estimate <dbl>,
## # std.error <dbl>, statistic <dbl>, p.value <dbl>, p.adjusted <dbl>
# Steepest increases in % yes first
country_topic_filtered %>%
arrange(desc(estimate))
## # A tibble: 56 × 8
## country topic term estimate std.error
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Malawi Palestinian conflict year 0.02008349 0.002890454
## 2 Nepal Palestinian conflict year 0.01868055 0.002207085
## 3 Barbados Palestinian conflict year 0.01658844 0.002954811
## 4 South Africa Economic development year 0.01657445 0.001879572
## 5 Malawi Colonialism year 0.01497103 0.002802337
## 6 Mongolia Economic development year 0.01394112 0.002780405
## 7 Myanmar Palestinian conflict year 0.01345118 0.002525473
## 8 South Africa Colonialism year 0.01299728 0.001447861
## 9 Portugal Colonialism year 0.01228600 0.002440573
## 10 Cuba Arms control and disarmament year 0.01129025 0.002260246
## # ... with 46 more rows, and 3 more variables: statistic <dbl>,
## # p.value <dbl>, p.adjusted <dbl>
Chapter 1 - Flight Data
Review xts fundamentals - time series data, consisting of one or more units over many periods:
Manipulating and visualizing data:
Saving and exporting time series data in R:
Example code includes:
# Create the flights dataset
flightsTotalFlights <- "8912 ; 8418 ; 9637 ; 9363 ; 9360 ; 9502 ; 9992 ; 10173 ; 9417 ; 9762 ; 9558 ; 9429 ; 9000 ; 8355 ; 9501 ; 9351 ; 9542 ; 9552 ; 9896 ; 9909 ; 8845 ; 9100 ; 8496 ; 8146 ; 8228 ; 8016 ; 8869 ; 8793 ; 8987 ; 8751 ; 8960 ; 9140 ; 8293 ; 8809 ; 8345 ; 8024 ; 8168 ; 7714 ; 9195 ; 9318 ; 9580 ; 9750 ; 10291 ; 10392 ; 9290 ; 9702 ; 9075 ; 8890 ; 8283 ; 7755 ; 9322 ; 9374 ; 9534 ; 9662 ; 10098 ; 9932 ; 9105 ; 9673 ; 9020 ; 8872 ; 8841 ; 8383 ; 9980 ; 10005 ; 10243 ; 10544 ; 10837 ; 10728 ; 9724 ; 10161 ; 9463 ; 9103"
flightsDelayFlights <-"1989 ; 1918 ; 2720 ; 1312 ; 1569 ; 1955 ; 2256 ; 2108 ; 1708 ; 1897 ; 1785 ; 2483 ; 1965 ; 1511 ; 2139 ; 2568 ; 3391 ; 2649 ; 2336 ; 2653 ; 2079 ; 1827 ; 1151 ; 889 ; 1254 ; 857 ; 1606 ; 1142 ; 1686 ; 1970 ; 2121 ; 1923 ; 1490 ; 1358 ; 1240 ; 1470 ; 1134 ; 1413 ; 2089 ; 1809 ; 2009 ; 2748 ; 3045 ; 2278 ; 1434 ; 1148 ; 1044 ; 2249 ; 1825 ; 1571 ; 1597 ; 1544 ; 1899 ; 2279 ; 2652 ; 1984 ; 1288 ; 2163 ; 1602 ; 1912 ; 1970 ; 2739 ; 2232 ; 1895 ; 1878 ; 2488 ; 2356 ; 2399 ; 1622 ; 1471 ; 1370 ; 1826"
flightsCancelFlights <- "279 ; 785 ; 242 ; 58 ; 102 ; 157 ; 222 ; 138 ; 144 ; 131 ; 99 ; 678 ; 904 ; 654 ; 153 ; 207 ; 198 ; 226 ; 208 ; 698 ; 135 ; 99 ; 79 ; 72 ; 107 ; 62 ; 72 ; 39 ; 54 ; 118 ; 89 ; 98 ; 69 ; 624 ; 90 ; 101 ; 81 ; 479 ; 218 ; 92 ; 58 ; 118 ; 150 ; 55 ; 73 ; 31 ; 55 ; 223 ; 707 ; 593 ; 191 ; 65 ; 141 ; 141 ; 181 ; 65 ; 69 ; 82 ; 51 ; 44 ; 658 ; 1123 ; 238 ; 68 ; 79 ; 138 ; 85 ; 97 ; 45 ; 57 ; 50 ; 77"
flightsDivertFlights <- "9 ; 23 ; 32 ; 7 ; 8 ; 5 ; 10 ; 20 ; 6 ; 9 ; 2 ; 6 ; 11 ; 7 ; 16 ; 10 ; 13 ; 15 ; 8 ; 17 ; 8 ; 1 ; 5 ; 2 ; 12 ; 5 ; 4 ; 1 ; 4 ; 12 ; 10 ; 6 ; 6 ; 7 ; 2 ; 10 ; 13 ; 20 ; 12 ; 6 ; 9 ; 17 ; 20 ; 9 ; 9 ; 6 ; 9 ; 18 ; 36 ; 13 ; 3 ; 5 ; 7 ; 6 ; 13 ; 7 ; 9 ; 9 ; 3 ; 10 ; 10 ; 20 ; 28 ; 10 ; 17 ; 7 ; 4 ; 23 ; 6 ; 10 ; 6 ; 10"
flightsDate <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
flights <- data.frame(total_flights=as.numeric(strsplit(flightsTotalFlights, " ; ")[[1]]),
delay_flights=as.numeric(strsplit(flightsDelayFlights, " ; ")[[1]]),
cancel_flights=as.numeric(strsplit(flightsCancelFlights, " ; ")[[1]]),
divert_flights=as.numeric(strsplit(flightsDivertFlights, " ; ")[[1]]),
date=as.character(strsplit(flightsDate, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# Inspect the assembled data frame.  Throughout this document, lines beginning
# with "## " are console output captured when the document was rendered.
#View the structure of the flights data
str(flights)
## 'data.frame': 72 obs. of 5 variables:
## $ total_flights : num 8912 8418 9637 9363 9360 ...
## $ delay_flights : num 1989 1918 2720 1312 1569 ...
## $ cancel_flights: num 279 785 242 58 102 157 222 138 144 131 ...
## $ divert_flights: num 9 23 32 7 8 5 10 20 6 9 ...
## $ date : chr "2010-01-01" "2010-02-01" "2010-03-01" "2010-04-01" ...
#Examine the first five rows of the flights data
head(flights, n = 5)
## total_flights delay_flights cancel_flights divert_flights date
## 1 8912 1989 279 9 2010-01-01
## 2 8418 1918 785 23 2010-02-01
## 3 9637 2720 242 32 2010-03-01
## 4 9363 1312 58 7 2010-04-01
## 5 9360 1569 102 8 2010-05-01
#Identify class of the column containing date information
class(flights$date)
## [1] "character"
# Load the xts package.  xts builds on zoo; as the startup messages below show,
# loading it masks first/last from data.table and dplyr in this session.
library(xts)
## Warning: package 'xts' was built under R version 3.2.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:data.table':
##
## last
## The following objects are masked from 'package:dplyr':
##
## first, last
# Convert date column to a time-based class
flights$date <- as.Date(flights$date)
# Convert flights to an xts object using as.xts.  Column 5 (date) is dropped
# from the data because xts stores the time information in its index instead.
flights_xts <- as.xts(flights[ , -5], order.by = flights$date)
# Check the class of flights_xts
class(flights_xts)
## [1] "xts" "zoo"
# Examine the first five lines of flights_xts
head(flights_xts, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
# Identify the periodicity of flights_xts
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Identify the number of periods in flights_xts
nmonths(flights_xts)
## [1] 72
# Find data on flights arriving in BOS in June 2014 -- xts supports
# subsetting directly by an ISO-8601 date string.
flights_xts["2014-06-01"]
## total_flights delay_flights cancel_flights divert_flights
## 2014-06-01 9662 2279 141 6
# Single-series views with plot.xts(): total monthly flights into BOS, then
# monthly delayed flights, each over the full 2010-2015 span.
plot.xts(flights_xts[, "total_flights"])
plot.xts(flights_xts[, "delay_flights"])
# plot.zoo() handles multi-column display: first all four series, one panel
# each, labelled via ylab ...
panel_labels <- c("Total", "Delay", "Cancel", "Divert")
plot.zoo(flights_xts, plot.type = "multiple", ylab = panel_labels)
# ... then all four overlaid in one panel, distinguished by line type, with a
# legend keyed to the same labels.
line_types <- 1:4
plot.zoo(flights_xts, plot.type = "single", lty = line_types)
legend("right", lty = line_types, legend = panel_labels)
# Derive the share of flights delayed each month (as a percentage) and append
# it to the xts object as a new column named pct_delay.
flights_xts$pct_delay <- (flights_xts[, "delay_flights"] / flights_xts[, "total_flights"]) * 100
# Plot the delay rate over time.
plot.xts(flights_xts$pct_delay)
# Likewise derive monthly cancellation and diversion rates.
flights_xts$pct_cancel <- (flights_xts[, "cancel_flights"] / flights_xts[, "total_flights"]) * 100
flights_xts$pct_divert <- (flights_xts[, "divert_flights"] / flights_xts[, "total_flights"]) * 100
# Display the three derived rate series together, one panel per series.
plot.zoo(x = flights_xts[ , c("pct_delay", "pct_cancel", "pct_divert")])
# Round-trip 1: RDS.  saveRDS/readRDS serialize the R object directly, so the
# xts class and index survive unchanged.
# Save your xts object to rds file using saveRDS
saveRDS(object = flights_xts, file = "flights_xts.rds")
# Read your flights_xts data from the rds file
flights_xts2 <- readRDS("flights_xts.rds")
# Check the class of your new flights_xts2 object
class(flights_xts2)
## [1] "xts" "zoo"
# Examine the first five rows of your new flights_xts2 object
head(flights_xts2, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## pct_delay pct_cancel pct_divert
## 2010-01-01 22.31822 3.1306104 0.10098743
## 2010-02-01 22.78451 9.3252554 0.27322404
## 2010-03-01 28.22455 2.5111549 0.33205354
## 2010-04-01 14.01260 0.6194596 0.07476236
## 2010-05-01 16.76282 1.0897436 0.08547009
# Round-trip 2: CSV.  A CSV keeps only text, so reading back requires read.zoo
# to rebuild the time index, then as.xts to restore the xts class.
# Export your xts object to a csv file using write.zoo
write.zoo(flights_xts, file = "flights_xts.csv", sep = ",")
# Open your saved object using read.zoo
flights2 <- read.zoo("flights_xts.csv", sep = ",", FUN = as.Date, header = TRUE, index.column = 1)
# Encode your new object back into xts
flights_xts2 <- as.xts(flights2)
# Examine the first five rows of your new flights_xts2 object
head(flights_xts2, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## pct_delay pct_cancel pct_divert
## 2010-01-01 22.31822 3.1306104 0.10098743
## 2010-02-01 22.78451 9.3252554 0.27322404
## 2010-03-01 28.22455 2.5111549 0.33205354
## 2010-04-01 14.01260 0.6194596 0.07476236
## 2010-05-01 16.76282 1.0897436 0.08547009
Chapter 2 - Weather Data
Merging using rbind() - since xts objects are already ordered by time, rbind() outputs will also be ordered by time:
Merging time series data by column:
Time series data workflows:
Example code includes:
# Cached to avoid multiple pings to this server
# Download BOS daily weather for 2007-2015 and keep min/mean/max temperature
# plus date.  Collect each year's frame in a preallocated list and bind once
# at the end: growing a data.frame with rbind() inside the loop copies the
# accumulated result on every iteration (quadratic in total rows).
years <- 2007:2015
weatherByYear <- vector("list", length(years))
for (i in seq_along(years)) {
  testWeather <- weatherData::getWeatherForYear(station_id="BOS", year=years[i])
  # mutate does not accept input variable "Date" as a POSIXlt; convert it outside dplyr
  testWeather$date <- as.Date(testWeather$Date)
  weatherByYear[[i]] <- testWeather %>%
    select(-Date) %>%
    mutate(min=Min_TemperatureF, mean=Mean_TemperatureF, max=Max_TemperatureF) %>%
    select(min, mean, max, date)
}
allWeather <- do.call(rbind, weatherByYear)
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2007/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2007&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2007-01-01 to 2007-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2008/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2008&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 366 records for 2008-01-01 to 2008-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2009/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2009&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2009-01-01 to 2009-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2010/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2010&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2010-01-01 to 2010-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2011/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2011&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2011-01-01 to 2011-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2012/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2012&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 366 records for 2012-01-01 to 2012-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2013/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2013&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2013-01-01 to 2013-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2014/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2014&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2014-01-01 to 2014-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2015/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2015&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2015-01-01 to 2015-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
# Confirm the combined download: 9 years of daily records (3287 rows).
str(allWeather)
## 'data.frame': 3287 obs. of 4 variables:
## $ min : int 21 41 30 24 23 28 24 28 30 28 ...
## $ mean: int 34 46 36 32 28 36 34 37 38 34 ...
## $ max : int 46 50 41 41 33 44 44 46 46 41 ...
## $ date: Date, format: "2007-01-01" "2007-01-02" ...
# Continuing, no need for cached data
# Split the daily data into two halves (through 2012 / from 2013) purely to
# demonstrate re-merging them with rbind() on xts objects below.
temps_1 <- allWeather %>%
filter(date <= "2012-12-31")
temps_2 <- allWeather %>%
filter(date > "2012-12-31")
# View the structure of each object
str(temps_1)
## 'data.frame': 2192 obs. of 4 variables:
## $ min : int 21 41 30 24 23 28 24 28 30 28 ...
## $ mean: int 34 46 36 32 28 36 34 37 38 34 ...
## $ max : int 46 50 41 41 33 44 44 46 46 41 ...
## $ date: Date, format: "2007-01-01" "2007-01-02" ...
str(temps_2)
## 'data.frame': 1095 obs. of 4 variables:
## $ min : int 28 33 35 26 32 32 28 21 17 21 ...
## $ mean: int 33 40 42 36 39 36 35 27 28 30 ...
## $ max : int 38 46 50 46 46 41 42 33 39 39 ...
## $ date: Date, format: "2013-01-01" "2013-01-02" ...
# View the first and last rows of temps_1
head(temps_1)
## min mean max date
## 1 21 34 46 2007-01-01
## 2 41 46 50 2007-01-02
## 3 30 36 41 2007-01-03
## 4 24 32 41 2007-01-04
## 5 23 28 33 2007-01-05
## 6 28 36 44 2007-01-06
tail(temps_1)
## min mean max date
## 2187 32 36 40 2012-12-26
## 2188 35 40 46 2012-12-27
## 2189 35 40 46 2012-12-28
## 2190 39 40 42 2012-12-29
## 2191 33 40 48 2012-12-30
## 2192 28 36 44 2012-12-31
# View the first and last rows of temps_2
head(temps_2)
## min mean max date
## 1 28 33 38 2013-01-01
## 2 33 40 46 2013-01-02
## 3 35 42 50 2013-01-03
## 4 26 36 46 2013-01-04
## 5 32 39 46 2013-01-05
## 6 32 36 41 2013-01-06
tail(temps_2)
## min mean max date
## 1090 30 40 50 2015-12-26
## 1091 26 38 51 2015-12-27
## 1092 28 40 52 2015-12-28
## 1093 28 39 50 2015-12-29
## 1094 29 38 45 2015-12-30
## 1095 19 28 34 2015-12-31
# Confirm that the date column in each object is a time-based class
class(temps_1$date)
## [1] "Date"
class(temps_2$date)
## [1] "Date"
# Encode the two temperature data frames as xts objects: the date column
# becomes the time index, leaving the three temperature columns as data.
temps_1_xts <- as.xts(temps_1[, c("min", "mean", "max")], order.by = temps_1$date)
temps_2_xts <- as.xts(temps_2[, c("min", "mean", "max")], order.by = temps_2$date)
# Confirm each new xts object is properly formatted.
head(temps_1_xts)
## min mean max
## 2007-01-01 21 34 46
## 2007-01-02 41 46 50
## 2007-01-03 30 36 41
## 2007-01-04 24 32 41
## 2007-01-05 23 28 33
## 2007-01-06 28 36 44
head(temps_2_xts)
## min mean max
## 2013-01-01 28 33 38
## 2013-01-02 33 40 46
## 2013-01-03 35 42 50
## 2013-01-04 26 36 46
## 2013-01-05 32 39 46
## 2013-01-06 32 36 41
# rbind() on xts objects merges by row; because both inputs carry a time
# index, the combined result comes back ordered by time.
temps_xts <- rbind(temps_1_xts, temps_2_xts)
# View data for the first 3 days of the last month of the first year in
# temps_xts.  Read the nested first()/last() calls inside-out: first year ->
# its last month -> that month's first 3 days.
first(last(first(temps_xts, "1 year"), "1 month"), "3 days")
## min mean max
## 2007-12-01 32 41 50
## 2007-12-02 28 39 50
## 2007-12-03 30 40 50
# Identify the periodicity of temps_xts
periodicity(temps_xts)
## Daily periodicity from 2007-01-01 to 2015-12-31
# Generate a plot of mean Boston temperature for the duration of your data
plot.xts(temps_xts$mean)
# Generate a plot of mean Boston temperature from November 2010 through April 2011
plot.xts(temps_xts["2010-11-01/2011-04-30"]$mean)
# Overlay min/mean/max for the same window; dotted lines (lty 3) for the
# min and max series, solid (lty 1) for the mean.
lty <- c(3, 1, 3)
plot.zoo(temps_xts["2010-11-01/2011-04-30"], plot.type = "single", lty = lty)
# Subset your temperature data to include only 2010 through 2015: temps_xts_2
temps_xts_2 <- temps_xts["2010/2015"]
# Use to.period to convert temps_xts_2 to monthly periodicity.  OHLC = FALSE
# returns a single observation per month instead of open/high/low/close
# columns; indexAt = "firstof" stamps each row with the first day of its month
# so the index lines up with flights_xts.
temps_monthly <- to.period(temps_xts_2, period = "months", OHLC = FALSE, indexAt = "firstof")
# Compare the periodicity and duration of temps_monthly and flights_xts
periodicity(temps_monthly)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Month-start index matching the flights data, cached as a delimited string.
idxRaw <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
index <- as.Date(strsplit(idxRaw, " ; ")[[1]])
# Split temps_xts_2 into separate lists per month
monthly_split <- split(temps_xts_2$mean , f = "months")
# Monthly average of the daily mean temperatures.  vapply() with
# FUN.VALUE = numeric(1) returns a type-checked numeric vector directly,
# instead of lapply() producing a list that must be coerced with as.numeric().
mean_of_means <- vapply(monthly_split, FUN = mean, FUN.VALUE = numeric(1))
# Use as.xts to generate an xts object of average monthly temperature data
# (as.numeric drops any names so the values enter the xts object cleanly).
temps_monthly <- as.xts(as.numeric(mean_of_means), order.by = index)
# Compare the periodicity and duration of your new temps_monthly and flights_xts
periodicity(temps_monthly)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Use merge to combine your flights and temperature objects.  merge() on xts
# joins on the time index; the unnamed temps_monthly series gets its column
# named after the argument ("temps_monthly"), as the output below shows.
flights_temps <- merge(flights_xts, temps_monthly)
# Examine the first few rows of your combined xts object
head(flights_temps)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## 2010-06-01 9502 1955 157 5
## pct_delay pct_cancel pct_divert temps_monthly
## 2010-01-01 22.31822 3.1306104 0.10098743 36.12903
## 2010-02-01 22.78451 9.3252554 0.27322404 37.71429
## 2010-03-01 28.22455 2.5111549 0.33205354 42.22581
## 2010-04-01 14.01260 0.6194596 0.07476236 51.26667
## 2010-05-01 16.76282 1.0897436 0.08547009 56.87097
## 2010-06-01 20.57462 1.6522837 0.05262050 63.56667
# Overlay delay rate and temperature in one panel to eyeball any relationship.
lty <- c(1, 2)
plot.zoo(flights_temps[,c("pct_delay", "temps_monthly")], plot.type = "single", lty = lty)
labels <- c("Pct. Delay", "Temperature")
legend("topright", lty = lty, legend = labels, bg = "white")
# Monthly wind and visibility series for the same 2010-2015 window, cached as
# delimited strings.  NOTE(review): presumably monthly averages of wind speed
# and visibility at BOS -- units are not shown in this excerpt; confirm
# against the original course material.
windData <- "7.19 ; 5.21 ; 4.9 ; 4.7 ; 4.13 ; 4.3 ; 4.74 ; 4.94 ; 4.57 ; 4.48 ; 5.97 ; 5.87 ; 4.58 ; 6 ; 5.58 ; 5.23 ; 4.71 ; 4.5 ; 3.94 ; 4.65 ; 4.73 ; 5.39 ; 4.2 ; 5.65 ; 5.55 ; 6.03 ; 5.29 ; 5.6 ; 4.03 ; 4.1 ; 4.71 ; 4.55 ; 4.33 ; 4.77 ; 4.63 ; 5.48 ; 5.68 ; 4.82 ; 6 ; 4.93 ; 5.19 ; 4.8 ; 5.19 ; 4.74 ; 4.7 ; 3.52 ; 4.87 ; 4.45 ; 3.87 ; 3.71 ; 5.16 ; 4.2 ; 4.06 ; 4.2 ; 4.32 ; 4.19 ; 4.27 ; 4.65 ; 3.67 ; 4.13 ; 4.77 ; 4.79 ; 5.26 ; 5 ; 4.52 ; 4.47 ; 4.52 ; 4.26 ; 5.03 ; 4.29 ; 4.07 ; 3.84"
visData <- "5.77 ; 5.86 ; 5.81 ; 6 ; 6 ; 6 ; 6 ; 6 ; 5.93 ; 6 ; 5.83 ; 5.97 ; 5.61 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 5.97 ; 6 ; 5.7 ; 5.61 ; 5.71 ; 5.66 ; 5.9 ; 6.37 ; 6.39 ; 7.5 ; 7.29 ; 7.77 ; 7.8 ; 7.65 ; 7.4 ; 6.68 ; 6.81 ; 6.82 ; 7 ; 7.57 ; 6.94 ; 6.83 ; 6.48 ; 6.45 ; 7.6 ; 9.03 ; 8.2 ; 8.97 ; 6.03 ; 8.57 ; 8.58 ; 7.77 ; 7.74 ; 7.77 ; 8.03 ; 8.55 ; 7.77 ; 8.23 ; 8.2 ; 8.23 ; 8.55 ; 8.79 ; 7.9 ; 8.6 ; 8.26 ; 7.67 ; 8.06 ; 7.87 ; 8.1 ; 7.81 ; 9.33 ; 8.77"
idxData <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
# Parse each string into an unnamed xts series on the shared monthly index;
# the merge below names the columns after these variables.
wind <- as.xts(as.numeric(strsplit(windData, " ; ")[[1]]),
order.by=as.Date(strsplit(idxData, " ; ")[[1]])
)
vis <- as.xts(as.numeric(strsplit(visData, " ; ")[[1]]),
order.by=as.Date(strsplit(idxData, " ; ")[[1]])
)
# Confirm the periodicity and duration of the vis and wind data
periodicity(vis)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(wind)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Merge vis and wind with your existing flights_temps data.  merge() accepts
# multiple xts arguments and joins them all on the shared monthly index; the
# unnamed series become columns "vis" and "wind" (from the argument names).
flights_weather <- merge(flights_temps, vis, wind)
# View the first few rows of your flights_weather data
head(flights_weather)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## 2010-06-01 9502 1955 157 5
## pct_delay pct_cancel pct_divert temps_monthly vis wind
## 2010-01-01 22.31822 3.1306104 0.10098743 36.12903 5.77 7.19
## 2010-02-01 22.78451 9.3252554 0.27322404 37.71429 5.86 5.21
## 2010-03-01 28.22455 2.5111549 0.33205354 42.22581 5.81 4.90
## 2010-04-01 14.01260 0.6194596 0.07476236 51.26667 6.00 4.70
## 2010-05-01 16.76282 1.0897436 0.08547009 56.87097 6.00 4.13
## 2010-06-01 20.57462 1.6522837 0.05262050 63.56667 6.00 4.30
Chapter 3 - Economic Data
Handling missingness - missing values confound identification of trends and/or statistical tests:
Lagging and differencing - moving averages in the data:
Rolling functions:
Example code includes:
# Quarterly US GDP, 1947 Q1 through 2016 Q3, cached as delimited strings.
# The GDP series deliberately contains "NA" tokens (missing quarters) used
# below to demonstrate na.locf / na.approx.
gdpDate <- "1947 Q1 ; 1947 Q2 ; 1947 Q3 ; 1947 Q4 ; 1948 Q1 ; 1948 Q2 ; 1948 Q3 ; 1948 Q4 ; 1949 Q1 ; 1949 Q2 ; 1949 Q3 ; 1949 Q4 ; 1950 Q1 ; 1950 Q2 ; 1950 Q3 ; 1950 Q4 ; 1951 Q1 ; 1951 Q2 ; 1951 Q3 ; 1951 Q4 ; 1952 Q1 ; 1952 Q2 ; 1952 Q3 ; 1952 Q4 ; 1953 Q1 ; 1953 Q2 ; 1953 Q3 ; 1953 Q4 ; 1954 Q1 ; 1954 Q2 ; 1954 Q3 ; 1954 Q4 ; 1955 Q1 ; 1955 Q2 ; 1955 Q3 ; 1955 Q4 ; 1956 Q1 ; 1956 Q2 ; 1956 Q3 ; 1956 Q4 ; 1957 Q1 ; 1957 Q2 ; 1957 Q3 ; 1957 Q4 ; 1958 Q1 ; 1958 Q2 ; 1958 Q3 ; 1958 Q4 ; 1959 Q1 ; 1959 Q2 ; 1959 Q3 ; 1959 Q4 ; 1960 Q1 ; 1960 Q2 ; 1960 Q3 ; 1960 Q4 ; 1961 Q1 ; 1961 Q2 ; 1961 Q3 ; 1961 Q4 ; 1962 Q1 ; 1962 Q2 ; 1962 Q3 ; 1962 Q4 ; 1963 Q1 ; 1963 Q2 ; 1963 Q3 ; 1963 Q4 ; 1964 Q1 ; 1964 Q2 ; 1964 Q3 ; 1964 Q4 ; 1965 Q1 ; 1965 Q2 ; 1965 Q3 ; 1965 Q4 ; 1966 Q1 ; 1966 Q2 ; 1966 Q3 ; 1966 Q4 ; 1967 Q1 ; 1967 Q2 ; 1967 Q3 ; 1967 Q4 ; 1968 Q1 ; 1968 Q2 ; 1968 Q3 ; 1968 Q4 ; 1969 Q1 ; 1969 Q2 ; 1969 Q3 ; 1969 Q4 ; 1970 Q1 ; 1970 Q2 ; 1970 Q3 ; 1970 Q4 ; 1971 Q1 ; 1971 Q2 ; 1971 Q3 ; 1971 Q4 ; 1972 Q1 ; 1972 Q2 ; 1972 Q3 ; 1972 Q4 ; 1973 Q1 ; 1973 Q2 ; 1973 Q3 ; 1973 Q4 ; 1974 Q1 ; 1974 Q2 ; 1974 Q3 ; 1974 Q4 ; 1975 Q1 ; 1975 Q2 ; 1975 Q3 ; 1975 Q4 ; 1976 Q1 ; 1976 Q2 ; 1976 Q3 ; 1976 Q4 ; 1977 Q1 ; 1977 Q2 ; 1977 Q3 ; 1977 Q4 ; 1978 Q1 ; 1978 Q2 ; 1978 Q3 ; 1978 Q4 ; 1979 Q1 ; 1979 Q2 ; 1979 Q3 ; 1979 Q4 ; 1980 Q1 ; 1980 Q2 ; 1980 Q3 ; 1980 Q4 ; 1981 Q1 ; 1981 Q2 ; 1981 Q3 ; 1981 Q4 ; 1982 Q1 ; 1982 Q2 ; 1982 Q3 ; 1982 Q4 ; 1983 Q1 ; 1983 Q2 ; 1983 Q3 ; 1983 Q4 ; 1984 Q1 ; 1984 Q2 ; 1984 Q3 ; 1984 Q4 ; 1985 Q1 ; 1985 Q2 ; 1985 Q3 ; 1985 Q4 ; 1986 Q1 ; 1986 Q2 ; 1986 Q3 ; 1986 Q4 ; 1987 Q1 ; 1987 Q2 ; 1987 Q3 ; 1987 Q4 ; 1988 Q1 ; 1988 Q2 ; 1988 Q3 ; 1988 Q4 ; 1989 Q1 ; 1989 Q2 ; 1989 Q3 ; 1989 Q4 ; 1990 Q1 ; 1990 Q2 ; 1990 Q3 ; 1990 Q4 ; 1991 Q1 ; 1991 Q2 ; 1991 Q3 ; 1991 Q4 ; 1992 Q1 ; 1992 Q2 ; 1992 Q3 ; 1992 Q4 ; 1993 Q1 ; 1993 Q2 ; 1993 Q3 ; 1993 Q4 ; 1994 Q1 ; 1994 Q2 ; 1994 Q3 ; 1994 Q4 ; 1995 Q1 ; 1995 Q2 ; 1995 Q3 ; 1995 Q4 ; 1996 Q1 ; 1996 Q2 ; 1996 Q3 
; 1996 Q4 ; 1997 Q1 ; 1997 Q2 ; 1997 Q3 ; 1997 Q4 ; 1998 Q1 ; 1998 Q2 ; 1998 Q3 ; 1998 Q4 ; 1999 Q1 ; 1999 Q2 ; 1999 Q3 ; 1999 Q4 ; 2000 Q1 ; 2000 Q2 ; 2000 Q3 ; 2000 Q4 ; 2001 Q1 ; 2001 Q2 ; 2001 Q3 ; 2001 Q4 ; 2002 Q1 ; 2002 Q2 ; 2002 Q3 ; 2002 Q4 ; 2003 Q1 ; 2003 Q2 ; 2003 Q3 ; 2003 Q4 ; 2004 Q1 ; 2004 Q2 ; 2004 Q3 ; 2004 Q4 ; 2005 Q1 ; 2005 Q2 ; 2005 Q3 ; 2005 Q4 ; 2006 Q1 ; 2006 Q2 ; 2006 Q3 ; 2006 Q4 ; 2007 Q1 ; 2007 Q2 ; 2007 Q3 ; 2007 Q4 ; 2008 Q1 ; 2008 Q2 ; 2008 Q3 ; 2008 Q4 ; 2009 Q1 ; 2009 Q2 ; 2009 Q3 ; 2009 Q4 ; 2010 Q1 ; 2010 Q2 ; 2010 Q3 ; 2010 Q4 ; 2011 Q1 ; 2011 Q2 ; 2011 Q3 ; 2011 Q4 ; 2012 Q1 ; 2012 Q2 ; 2012 Q3 ; 2012 Q4 ; 2013 Q1 ; 2013 Q2 ; 2013 Q3 ; 2013 Q4 ; 2014 Q1 ; 2014 Q2 ; 2014 Q3 ; 2014 Q4 ; 2015 Q1 ; 2015 Q2 ; 2015 Q3 ; 2015 Q4 ; 2016 Q1 ; 2016 Q2 ; 2016 Q3"
gdpGDP <- "243.1 ; 246.3 ; 250.1 ; 260.3 ; 266.2 ; 272.9 ; 279.5 ; 280.7 ; 275.4 ; NA ; NA ; 271 ; 281.2 ; NA ; 308.5 ; 320.3 ; 336.4 ; NA ; 351.8 ; 356.6 ; NA ; NA ; NA ; 381.2 ; 388.5 ; NA ; NA ; NA ; NA ; NA ; 391.6 ; 400.3 ; 413.8 ; 422.2 ; 430.9 ; NA ; NA ; 446.8 ; 452 ; 461.3 ; 470.6 ; 472.8 ; NA ; NA ; NA ; NA ; 486.7 ; 500.4 ; 511.1 ; 524.2 ; 525.2 ; 529.3 ; 543.3 ; 542.7 ; 546 ; 541.1 ; 545.9 ; 557.4 ; 568.2 ; 581.6 ; 595.2 ; 602.6 ; 609.6 ; NA ; NA ; NA ; NA ; 654.8 ; 671.1 ; 680.8 ; 692.8 ; 698.4 ; 719.2 ; 732.4 ; NA ; NA ; NA ; NA ; 820.8 ; 834.9 ; 846 ; 851.1 ; 866.6 ; 883.2 ; NA ; 936.3 ; 952.3 ; NA ; 995.4 ; 1011.4 ; 1032 ; 1040.7 ; 1053.5 ; 1070.1 ; NA ; 1091.5 ; 1137.8 ; 1159.4 ; 1180.3 ; 1193.6 ; 1233.8 ; NA ; NA ; 1332 ; 1380.7 ; 1417.6 ; 1436.8 ; 1479.1 ; 1494.7 ; 1534.2 ; NA ; 1603 ; NA ; NA ; 1713.8 ; 1765.9 ; 1824.5 ; 1856.9 ; 1890.5 ; 1938.4 ; 1992.5 ; 2060.2 ; 2122.4 ; NA ; NA ; 2336.6 ; 2398.9 ; 2482.2 ; 2531.6 ; NA ; 2670.4 ; 2730.7 ; 2796.5 ; 2799.9 ; 2860 ; NA ; 3131.8 ; 3167.3 ; 3261.2 ; 3283.5 ; 3273.8 ; NA ; NA ; NA ; 3480.3 ; 3583.8 ; 3692.3 ; 3796.1 ; NA ; NA ; NA ; NA ; 4237 ; 4302.3 ; 4394.6 ; 4453.1 ; NA ; NA ; NA ; NA ; 4736.2 ; 4821.5 ; 4900.5 ; 5022.7 ; NA ; NA ; NA ; NA ; NA ; NA ; 5711.6 ; 5763.4 ; 5890.8 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; 7545.3 ; 7604.9 ; 7706.5 ; 7799.5 ; 7893.1 ; 8061.5 ; 8159 ; 8287.1 ; 8402.1 ; 8551.9 ; 8691.8 ; 8788.3 ; 8889.7 ; 8994.7 ; 9146.5 ; 9325.7 ; 9447.1 ; NA ; 9712.3 ; 9926.1 ; 10031 ; 10278.3 ; 10357.4 ; 10472.3 ; 10508.1 ; 10638.4 ; 10639.5 ; 10701.3 ; NA ; NA ; 11037.1 ; 11103.8 ; 11230.1 ; 11370.7 ; 11625.1 ; 11816.8 ; 11988.4 ; 12181.4 ; 12367.7 ; 12562.2 ; 12813.7 ; 12974.1 ; 13205.4 ; 13381.6 ; 13648.9 ; NA ; 13908.5 ; 14066.4 ; 14233.2 ; 14422.3 ; 14569.7 ; 14685.3 ; 14668.4 ; 14813 ; 14843 ; 14549.9 ; 14383.9 ; 14340.4 ; 14384.1 ; 14566.5 ; 14681.1 ; 14888.6 ; 15057.7 ; 15230.2 ; NA ; 15460.9 ; 15587.1 ; 15785.3 ; 15973.9 
; 16121.9 ; 16227.9 ; 16297.3 ; 16475.4 ; 16541.4 ; 16749.3 ; 16999.9 ; 17025.2 ; 17285.6 ; 17569.4 ; 17692.2 ; NA ; 17998.3 ; 18141.9 ; 18222.8 ; 18281.6 ; 18450.1 ; 18651.2"
# The "NA" tokens are not numeric, so as.numeric() converts them to NA and
# emits the "NAs introduced by coercion" warning shown below -- expected here.
gdp <- data.frame(date=strsplit(gdpDate, " ; ")[[1]],
gdp_billions=as.numeric(strsplit(gdpGDP, " ; ")[[1]]),
stringsAsFactors=TRUE
) # want the date to be a factor to match input
## Warning in data.frame(date = strsplit(gdpDate, " ; ")[[1]], gdp_billions =
## as.numeric(strsplit(gdpGDP, : NAs introduced by coercion
# Count missing values across the whole data frame (all 80 are in gdp_billions).
sum(is.na(gdp))
## [1] 80
str(gdp)
## 'data.frame': 279 obs. of 2 variables:
## $ date : Factor w/ 279 levels "1947 Q1","1947 Q2",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gdp_billions: num 243 246 250 260 266 ...
# Get a summary of your GDP data
summary(gdp)
## date gdp_billions
## 1947 Q1: 1 Min. : 243.1
## 1947 Q2: 1 1st Qu.: 708.8
## 1947 Q3: 1 Median : 3167.3
## 1947 Q4: 1 Mean : 6186.6
## 1948 Q1: 1 3rd Qu.:11497.9
## 1948 Q2: 1 Max. :18651.2
## (Other):273 NA's :80
# Convert GDP date column to a quarterly time object; zoo's as.yearqtr parses
# strings like "1947 Q1".
gdp$date <- as.yearqtr(gdp$date)
# Convert GDP data to xts, dropping column 1 (date) since it becomes the index.
gdp_xts <- as.xts(gdp[, -1], order.by = gdp$date)
names(gdp_xts) <- "gdp"
# Plot GDP data over time (gaps where values are missing)
plot.xts(gdp_xts)
# Fill NAs in gdp_xts with the last observation carried forward (na.locf)
gdp_locf <- na.locf(gdp_xts)
# Fill NAs with the next observation carried backward (fromLast reverses locf)
gdp_nocb <- na.locf(gdp_xts, fromLast=TRUE)
# Stack the two filled series in one plot window, then restore the layout.
par(mfrow = c(2,1))
plot.xts(gdp_locf, major.format = "%Y")
plot.xts(gdp_nocb, major.format = "%Y")
par(mfrow = c(1,1))
# Query for GDP in 1993 in both gdp_locf and gdp_nocb.  1993-1994 sits inside
# a long NA run, so locf repeats the last known value (5890.8) while nocb
# repeats the next known one (7545.3).
gdp_locf["1993"]
## gdp
## 1993 Q2 5890.8
## 1993 Q3 5890.8
## 1993 Q4 5890.8
## 1994 Q1 5890.8
gdp_nocb["1993"]
## gdp
## 1993 Q2 7545.3
## 1993 Q3 7545.3
## 1993 Q4 7545.3
## 1994 Q1 7545.3
# Fill NAs in gdp_xts using linear approximation between known quarters
gdp_approx <- na.approx(gdp_xts)
# Plot your new xts object
plot.xts(gdp_approx, major.format = "%Y")
# Query for GDP in 1993 in gdp_approx -- values now ramp linearly between the
# surrounding known observations instead of being held flat.
gdp_approx["1993"]
## gdp
## 1993 Q2 6966.225
## 1993 Q3 7048.950
## 1993 Q4 7131.675
## 1994 Q1 7214.400
# Monthly unemployment-rate series with deliberate gaps, cached as a
# " ; "-delimited string (parsed further below, outside this excerpt).
unemCore1 <- "7.9 ; 7.7 ; 7.6 ; 7.7 ; 7.4 ; 7.6 ; NA ; NA ; 7.6 ; 7.7 ; 7.8 ; NA ; 7.5 ; 7.6 ; 7.4 ; 7.2 ; 7 ; 7.2 ; 6.9 ; 7 ; 6.8 ; NA ; NA ; 6.4 ; 6.4 ; 6.3 ; 6.3 ; 6.1 ; 6 ; 5.9 ; 6.2 ; 5.9 ; 6 ; 5.8 ; 5.9 ; 6 ; 5.9 ; 5.9 ; NA ; NA ; NA ; NA ; NA ; 6 ; 5.9 ; 6 ; 5.9 ; 6 ; 6.3 ; NA ; NA ; 6.9 ; 7.5 ; 7.6 ; 7.8 ; 7.7 ; 7.5 ; NA ; NA ; 7.2 ; 7.5 ; 7.4 ; 7.4 ; 7.2 ; NA ; NA ; 7.2 ; 7.4 ; 7.6 ; 7.9 ; 8.3 ; 8.5 ; 8.6 ; 8.9 ; 9 ; 9.3 ; 9.4 ; NA ; NA ; NA ; NA ; NA ; 10.8 ; 10.8 ; 10.4 ; 10.4 ; 10.3 ; 10.2 ; 10.1 ; 10.1 ; 9.4 ; 9.5 ; 9.2 ; NA ; NA ; NA ; 8 ; NA ; 7.8 ; NA ; NA ; 7.2 ; 7.5 ; 7.5 ; 7.3 ; 7.4 ; 7.2 ; 7.3 ; 7.3 ; 7.2 ; 7.2 ; 7.3 ; 7.2 ; 7.4 ; 7.4 ; 7.1 ; 7.1 ; 7.1 ; 7 ; 7 ; 6.7 ; 7.2 ; 7.2 ; 7.1 ; 7.2 ; 7.2 ; 7 ; 6.9 ; 7 ; 7 ; 6.9 ; 6.6 ; 6.6 ; 6.6 ; 6.6 ; 6.3 ; 6.3 ; 6.2 ; 6.1 ; 6 ; 5.9 ; 6 ; 5.8 ; 5.7 ; NA ; NA ; NA ; 5.4 ; 5.6 ; 5.4 ; 5.4 ; 5.6 ; 5.4 ; NA ; NA ; NA ; 5.4 ; 5.2 ; 5 ; 5.2 ; NA ; NA ; NA ; NA ; 5.3 ; 5.3 ; 5.4 ; NA ; 5.4 ; 5.3 ; 5.2 ; 5.4 ; 5.4 ; 5.2 ; 5.5 ; 5.7 ; 5.9 ; 5.9 ; 6.2 ; 6.3 ; 6.4 ; 6.6 ; 6.8 ; 6.7 ; 6.9 ; 6.9 ; 6.8 ; 6.9 ; 6.9 ; 7 ; 7 ; 7.3 ; 7.3 ; 7.4 ; 7.4 ; 7.4 ; 7.6 ; 7.8 ; 7.7 ; 7.6 ; 7.6 ; 7.3 ; 7.4 ; 7.4 ; 7.3 ; 7.1 ; 7 ; 7.1 ; 7.1 ; 7 ; 6.9 ; 6.8 ; 6.7 ; 6.8 ; 6.6 ; 6.5 ; 6.6 ; NA ; 6.5 ; 6.4 ; 6.1 ; NA ; NA ; 6 ; 5.9 ; 5.8 ; 5.6 ; 5.5 ; 5.6 ; 5.4 ; 5.4 ; 5.8 ; 5.6 ; 5.6 ; 5.7 ; 5.7 ; 5.6 ; 5.5 ; 5.6 ; NA ; NA ; NA ; NA ; NA ; 5.6 ; 5.3 ; 5.5 ; 5.1 ; 5.2 ; 5.2 ; 5.4 ; 5.4 ; 5.3 ; 5.2 ; 5.2 ; 5.1 ; 4.9 ; 5 ; 4.9 ; 4.8 ; 4.9 ; 4.7 ; 4.6 ; 4.7 ; 4.6 ; 4.6 ; 4.7 ; 4.3 ; 4.4 ; 4.5 ; 4.5 ; 4.5 ; 4.6 ; 4.5 ; NA ; NA ; NA ; 4.4 ; 4.2 ; 4.3 ; 4.2 ; 4.3 ; 4.3 ; 4.2 ; 4.2 ; 4.1 ; 4.1 ; 4 ; 4 ; 4.1 ; 4 ; 3.8 ; 4 ; 4 ; 4 ; 4.1 ; 3.9 ; NA ; NA ; NA ; 4.2 ; 4.2 ; 4.3 ; 4.4 ; 4.3 ; 4.5 ; 4.6 ; 4.9 ; 5 ; 5.3 ; 5.5 ; 5.7 ; NA ; NA ; NA ; 5.9 ; 5.8 ; 5.8 ; 5.8 ; 5.7 ; 5.7 ; 5.7 ; 5.9 ; 6 ; 5.8 ; 5.9 ; 5.9 ; 6 ; 6.1 ; 6.3 ; 6.2 ; 6.1 ; 6.1 ; 6 ; 5.8 ; 5.7 ; 5.7 ; 5.6 ; 5.8 ; 5.6 ; 5.6 ; 5.6 ; 5.5 ; 5.4 ; 5.4 ; 5.5 ; 5.4 ; 5.4 ; 5.3 ; 5.4 ; 
5.2 ; 5.2 ; 5.1 ; 5 ; 5 ; 4.9 ; 5 ; 5 ; 5 ; 4.9 ; 4.7 ; 4.8 ; 4.7 ; 4.7 ; 4.6 ; 4.6 ; 4.7 ; 4.7 ; 4.5 ; 4.4 ; 4.5 ; 4.4 ; 4.6 ; 4.5 ; 4.4 ; 4.5 ; 4.4 ; 4.6 ; 4.7 ; 4.6 ; 4.7 ; 4.7 ; 4.7 ; 5 ; 5 ; 4.9 ; 5.1 ; 5 ; 5.4 ; 5.6 ; 5.8 ; NA ; NA ; NA ; 6.8 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; 10 ; 9.9 ; 9.9 ; 9.8 ; 9.8 ; 9.9 ; 9.9 ; 9.6 ; 9.4 ; 9.4 ; 9.5 ; 9.5 ; 9.4 ; 9.8 ; 9.3 ; 9.1 ; 9 ; 9 ; 9.1 ; 9 ; 9.1 ; 9 ; 9 ; 9 ; 8.8 ; NA ; NA ; NA ; NA ; NA ; 8.2 ; 8.2 ; 8.2 ; 8.2 ; 8.1 ; 7.8 ; 7.8 ; 7.7 ; 7.9 ; 8 ; 7.7 ; 7.5 ; 7.6 ; 7.5 ; 7.5 ; 7.3 ; 7.3 ; 7.3 ; 7.2 ; 6.9 ; 6.7 ; 6.6 ; 6.7 ; 6.7 ; 6.2 ; 6.2 ; 6.1 ; 6.2 ; 6.2 ; 6 ; 5.7 ; 5.8 ; 5.6 ; 5.7 ; 5.5 ; 5.5 ; 5.4 ; 5.5 ; 5.3 ; 5.3 ; 5.1 ; 5.1 ; 5 ; 5 ; 5 ; 11.6 ; NA ; 10.9 ; NA ; 9.4 ; 9.8 ; 9.7 ; 9 ; 9 ; 8.3 ; 8.3 ; 8.2 ; 9.5 ; 9.2 ; 8.8 ; NA ; 7.6 ; 8.2 ; 7.8 ; NA ; 7.5 ; 6.6 ; NA ; 6.2 ; 7.5 ; 7.2 ; 6.9 ; 6 ; 5.9 ; 6.4 ; 6.5"
unemCore2 <- "5.9 ; 6 ; 5.2 ; NA ; 5.7 ; 6.7 ; 6.4 ; 6.2 ; 5.4 ; 5.2 ; 5.7 ; 5.4 ; NA ; 5.4 ; 4.7 ; 4.8 ; 4.9 ; 6.1 ; 5.8 ; 5.8 ; 5.3 ; 5.6 ; 6.1 ; 6.1 ; 5.7 ; 5.6 ; 5.2 ; NA ; 5.1 ; 6.7 ; 6.4 ; 6.3 ; 5.7 ; 5.9 ; 6.5 ; NA ; 6.5 ; 6.8 ; 6.4 ; 6.7 ; 7 ; 8.4 ; 8.2 ; 8 ; 7.5 ; 7.5 ; 8 ; 8 ; 7.6 ; 7.6 ; 7.1 ; 7.3 ; 7.4 ; 8.3 ; 8 ; 7.7 ; 6.9 ; 6.8 ; 7.1 ; 6.6 ; 6.4 ; 6.4 ; 5.7 ; 5.6 ; 5.6 ; 6.5 ; 6 ; 5.7 ; 5 ; 4.5 ; 4.7 ; 4.7 ; 4.4 ; 4.4 ; 3.7 ; 3.7 ; 3.8 ; 4.9 ; 4.5 ; 4.4 ; NA ; 3.8 ; 4.2 ; 4.2 ; 3.8 ; 3.8 ; 3.4 ; 3.4 ; 3.5 ; NA ; 4.4 ; 4.3 ; 3.8 ; 3.9 ; 4.2 ; 4.1 ; 3.8 ; 3.9 ; 3.3 ; 3.3 ; NA ; 4.3 ; 4.2 ; 4 ; 3.3 ; 3.2 ; 3.4 ; 3.3 ; 2.8 ; 2.9 ; 2.4 ; 2.5 ; 2.6 ; 3.9 ; 3.7 ; 3.6 ; 3 ; 3 ; 3.4 ; 3.4 ; 3.1 ; 3.2 ; 2.8 ; 3 ; 3.1 ; 4.3 ; 4.1 ; 4 ; 3.7 ; 3.7 ; 4.2 ; 4.3 ; 4.1 ; 4.4 ; 4.1 ; 4.3 ; 4.5 ; 5.8 ; NA ; 5.9 ; 5.6 ; 5.7 ; 6.1 ; 6.5 ; 6.4 ; 6.8 ; 6.5 ; 7 ; 7.3 ; 8.7 ; 8.8 ; 8.9 ; 8.3 ; 8.6 ; 8.9 ; 8.9 ; 8.7 ; 8.8 ; 8.3 ; NA ; NA ; 9.4 ; 9.2 ; 9 ; 8.3 ; 8.4 ; 8.8 ; 8.7 ; 8.3 ; 8.4 ; 7.7 ; NA ; 7.6 ; 8.6 ; 8.2 ; 7.8 ; 7.1 ; 6.9 ; 7.1 ; 6.9 ; 6.5 ; 6.6 ; 6.1 ; 6 ; 6 ; 7.3 ; 6.9 ; 6.7 ; 6.1 ; 5.8 ; 6.2 ; 6.1 ; 5.8 ; 5.8 ; 5.4 ; 5.3 ; 5.3 ; 6.4 ; 5.9 ; 5.7 ; 5.3 ; NA ; 5.5 ; 5.5 ; 5.1 ; 5.2 ; 4.6 ; 4.6 ; 4.6 ; 5.7 ; 5.2 ; 5 ; 4.4 ; 4.4 ; 4.5 ; 4.6 ; 4 ; 4.2 ; 3.7 ; 3.8 ; 3.8 ; 4.9 ; 4.6 ; 4.4 ; 3.9 ; 3.8 ; 4.2 ; 4.1 ; 3.7 ; 3.8 ; 3.2 ; 3.2 ; 3.3 ; 4.2 ; 3.9 ; 3.8 ; 3 ; 3.1 ; 3.5 ; 3.4 ; 3 ; 3.2 ; 2.8 ; 2.8 ; 2.9 ; 3.9 ; 3.6 ; 3.4 ; 3.1 ; 3 ; 3.4 ; 3.5 ; 3 ; 3.2 ; 2.7 ; 2.7 ; 2.7 ; 3.6 ; 3.3 ; 3.1 ; 2.4 ; 2.5 ; 2.8 ; 2.8 ; 2.5 ; 2.5 ; 2.1 ; 2.3 ; 2.4 ; 3.7 ; 3.5 ; 3.6 ; 3.2 ; 3.3 ; 3.8 ; 3.9 ; 3.8 ; 4 ; 3.7 ; 4 ; 4.2 ; 5.5 ; 5.3 ; 5.3 ; 5 ; 5 ; 5.5 ; 5.5 ; 5.3 ; 5.4 ; 5 ; 5.2 ; 5.2 ; 6.3 ; 6 ; 5.9 ; 5.5 ; 5.6 ; 6.1 ; 5.9 ; 5.7 ; 5.8 ; 5.2 ; 5.3 ; 5.2 ; 6.2 ; 5.8 ; 5.7 ; 5.1 ; 5.1 ; 5.5 ; 5.3 ; 4.8 ; 4.9 ; 4.4 ; 4.5 ; 4.5 ; 5.6 ; 5.4 ; 5.1 ; 4.7 ; 4.6 ; 5 ; 4.9 ; 4.5 ; 4.9 ; 4.4 ; 4.7 ; 4.6 ; 5.5 ; 5.4 ; 5.2 ; 4.9 ; 4.7 ; 5.1 ; 5 ; 4.7 ; 4.9 ; 4.3 ; 4.5 ; 4.6 ; 5.6 ; 5.2 ; 
4.8 ; NA ; NA ; 4.8 ; 4.7 ; 4.3 ; 4.5 ; 4 ; 4.1 ; 4.4 ; 5.4 ; 5.2 ; 5 ; 4.6 ; 5 ; 5.6 ; 5.7 ; 5.6 ; 5.9 ; 5.7 ; 6.1 ; 6.6 ; 7.9 ; 7.9 ; 7.8 ; 7.3 ; 7.7 ; 8.3 ; 8.4 ; 8.2 ; 8.6 ; 8.2 ; 8.3 ; 8.5 ; 9.6 ; 9.2 ; 8.9 ; 8.3 ; 8.2 ; 8.4 ; 8.3 ; 7.9 ; 8 ; 7.5 ; 7.7 ; 7.6 ; 8.5 ; 8.1 ; 7.7 ; 7.2 ; 7.1 ; 7.6 ; 7.4 ; 6.9 ; 7.1 ; 6.5 ; 6.4 ; 6.6 ; 7.4 ; 7.2 ; 6.8 ; 6.3 ; 6.3 ; 6.9 ; 6.9 ; 6.6 ; 6.6 ; 6.2 ; 6.2 ; 6.5 ; 7.6 ; 7.2 ; 7 ; 6.6 ; 6.6 ; 7.3 ; 7 ; 6.6 ; 6.6 ; 6.2 ; 6 ; 5.9 ; 6.8 ; 6.5 ; 6.2 ; 5.5 ; 5.5 ; 6 ; 6 ; NA ; 5.7 ; 5 ; 5 ; 4.9 ; 5.8 ; 5.5 ; 5.2 ; 4.7 ; 4.9 ; 5.2 ; 5.2 ; 4.7 ; 4.9 ; 4.5 ; 4.5 ; 4.6"
# Month labels ("Mon YYYY") matching the unemployment values above, in the
# same semicolon-delimited layout. First half: Jan 1976 through Dec 1996.
unemIndex1 <- "Jan 1976 ; Feb 1976 ; Mar 1976 ; Apr 1976 ; May 1976 ; Jun 1976 ; Jul 1976 ; Aug 1976 ; Sep 1976 ; Oct 1976 ; Nov 1976 ; Dec 1976 ; Jan 1977 ; Feb 1977 ; Mar 1977 ; Apr 1977 ; May 1977 ; Jun 1977 ; Jul 1977 ; Aug 1977 ; Sep 1977 ; Oct 1977 ; Nov 1977 ; Dec 1977 ; Jan 1978 ; Feb 1978 ; Mar 1978 ; Apr 1978 ; May 1978 ; Jun 1978 ; Jul 1978 ; Aug 1978 ; Sep 1978 ; Oct 1978 ; Nov 1978 ; Dec 1978 ; Jan 1979 ; Feb 1979 ; Mar 1979 ; Apr 1979 ; May 1979 ; Jun 1979 ; Jul 1979 ; Aug 1979 ; Sep 1979 ; Oct 1979 ; Nov 1979 ; Dec 1979 ; Jan 1980 ; Feb 1980 ; Mar 1980 ; Apr 1980 ; May 1980 ; Jun 1980 ; Jul 1980 ; Aug 1980 ; Sep 1980 ; Oct 1980 ; Nov 1980 ; Dec 1980 ; Jan 1981 ; Feb 1981 ; Mar 1981 ; Apr 1981 ; May 1981 ; Jun 1981 ; Jul 1981 ; Aug 1981 ; Sep 1981 ; Oct 1981 ; Nov 1981 ; Dec 1981 ; Jan 1982 ; Feb 1982 ; Mar 1982 ; Apr 1982 ; May 1982 ; Jun 1982 ; Jul 1982 ; Aug 1982 ; Sep 1982 ; Oct 1982 ; Nov 1982 ; Dec 1982 ; Jan 1983 ; Feb 1983 ; Mar 1983 ; Apr 1983 ; May 1983 ; Jun 1983 ; Jul 1983 ; Aug 1983 ; Sep 1983 ; Oct 1983 ; Nov 1983 ; Dec 1983 ; Jan 1984 ; Feb 1984 ; Mar 1984 ; Apr 1984 ; May 1984 ; Jun 1984 ; Jul 1984 ; Aug 1984 ; Sep 1984 ; Oct 1984 ; Nov 1984 ; Dec 1984 ; Jan 1985 ; Feb 1985 ; Mar 1985 ; Apr 1985 ; May 1985 ; Jun 1985 ; Jul 1985 ; Aug 1985 ; Sep 1985 ; Oct 1985 ; Nov 1985 ; Dec 1985 ; Jan 1986 ; Feb 1986 ; Mar 1986 ; Apr 1986 ; May 1986 ; Jun 1986 ; Jul 1986 ; Aug 1986 ; Sep 1986 ; Oct 1986 ; Nov 1986 ; Dec 1986 ; Jan 1987 ; Feb 1987 ; Mar 1987 ; Apr 1987 ; May 1987 ; Jun 1987 ; Jul 1987 ; Aug 1987 ; Sep 1987 ; Oct 1987 ; Nov 1987 ; Dec 1987 ; Jan 1988 ; Feb 1988 ; Mar 1988 ; Apr 1988 ; May 1988 ; Jun 1988 ; Jul 1988 ; Aug 1988 ; Sep 1988 ; Oct 1988 ; Nov 1988 ; Dec 1988 ; Jan 1989 ; Feb 1989 ; Mar 1989 ; Apr 1989 ; May 1989 ; Jun 1989 ; Jul 1989 ; Aug 1989 ; Sep 1989 ; Oct 1989 ; Nov 1989 ; Dec 1989 ; Jan 1990 ; Feb 1990 ; Mar 1990 ; Apr 1990 ; May 1990 ; Jun 1990 ; Jul 1990 ; Aug 1990 ; Sep 1990 ; Oct 1990 ; Nov 1990 ; Dec 1990 ; Jan 
1991 ; Feb 1991 ; Mar 1991 ; Apr 1991 ; May 1991 ; Jun 1991 ; Jul 1991 ; Aug 1991 ; Sep 1991 ; Oct 1991 ; Nov 1991 ; Dec 1991 ; Jan 1992 ; Feb 1992 ; Mar 1992 ; Apr 1992 ; May 1992 ; Jun 1992 ; Jul 1992 ; Aug 1992 ; Sep 1992 ; Oct 1992 ; Nov 1992 ; Dec 1992 ; Jan 1993 ; Feb 1993 ; Mar 1993 ; Apr 1993 ; May 1993 ; Jun 1993 ; Jul 1993 ; Aug 1993 ; Sep 1993 ; Oct 1993 ; Nov 1993 ; Dec 1993 ; Jan 1994 ; Feb 1994 ; Mar 1994 ; Apr 1994 ; May 1994 ; Jun 1994 ; Jul 1994 ; Aug 1994 ; Sep 1994 ; Oct 1994 ; Nov 1994 ; Dec 1994 ; Jan 1995 ; Feb 1995 ; Mar 1995 ; Apr 1995 ; May 1995 ; Jun 1995 ; Jul 1995 ; Aug 1995 ; Sep 1995 ; Oct 1995 ; Nov 1995 ; Dec 1995 ; Jan 1996 ; Feb 1996 ; Mar 1996 ; Apr 1996 ; May 1996 ; Jun 1996 ; Jul 1996 ; Aug 1996 ; Sep 1996 ; Oct 1996 ; Nov 1996 ; Dec 1996"
# Second half of the index: Jan 1997 through Dec 2015.
unemIndex2 <- "Jan 1997 ; Feb 1997 ; Mar 1997 ; Apr 1997 ; May 1997 ; Jun 1997 ; Jul 1997 ; Aug 1997 ; Sep 1997 ; Oct 1997 ; Nov 1997 ; Dec 1997 ; Jan 1998 ; Feb 1998 ; Mar 1998 ; Apr 1998 ; May 1998 ; Jun 1998 ; Jul 1998 ; Aug 1998 ; Sep 1998 ; Oct 1998 ; Nov 1998 ; Dec 1998 ; Jan 1999 ; Feb 1999 ; Mar 1999 ; Apr 1999 ; May 1999 ; Jun 1999 ; Jul 1999 ; Aug 1999 ; Sep 1999 ; Oct 1999 ; Nov 1999 ; Dec 1999 ; Jan 2000 ; Feb 2000 ; Mar 2000 ; Apr 2000 ; May 2000 ; Jun 2000 ; Jul 2000 ; Aug 2000 ; Sep 2000 ; Oct 2000 ; Nov 2000 ; Dec 2000 ; Jan 2001 ; Feb 2001 ; Mar 2001 ; Apr 2001 ; May 2001 ; Jun 2001 ; Jul 2001 ; Aug 2001 ; Sep 2001 ; Oct 2001 ; Nov 2001 ; Dec 2001 ; Jan 2002 ; Feb 2002 ; Mar 2002 ; Apr 2002 ; May 2002 ; Jun 2002 ; Jul 2002 ; Aug 2002 ; Sep 2002 ; Oct 2002 ; Nov 2002 ; Dec 2002 ; Jan 2003 ; Feb 2003 ; Mar 2003 ; Apr 2003 ; May 2003 ; Jun 2003 ; Jul 2003 ; Aug 2003 ; Sep 2003 ; Oct 2003 ; Nov 2003 ; Dec 2003 ; Jan 2004 ; Feb 2004 ; Mar 2004 ; Apr 2004 ; May 2004 ; Jun 2004 ; Jul 2004 ; Aug 2004 ; Sep 2004 ; Oct 2004 ; Nov 2004 ; Dec 2004 ; Jan 2005 ; Feb 2005 ; Mar 2005 ; Apr 2005 ; May 2005 ; Jun 2005 ; Jul 2005 ; Aug 2005 ; Sep 2005 ; Oct 2005 ; Nov 2005 ; Dec 2005 ; Jan 2006 ; Feb 2006 ; Mar 2006 ; Apr 2006 ; May 2006 ; Jun 2006 ; Jul 2006 ; Aug 2006 ; Sep 2006 ; Oct 2006 ; Nov 2006 ; Dec 2006 ; Jan 2007 ; Feb 2007 ; Mar 2007 ; Apr 2007 ; May 2007 ; Jun 2007 ; Jul 2007 ; Aug 2007 ; Sep 2007 ; Oct 2007 ; Nov 2007 ; Dec 2007 ; Jan 2008 ; Feb 2008 ; Mar 2008 ; Apr 2008 ; May 2008 ; Jun 2008 ; Jul 2008 ; Aug 2008 ; Sep 2008 ; Oct 2008 ; Nov 2008 ; Dec 2008 ; Jan 2009 ; Feb 2009 ; Mar 2009 ; Apr 2009 ; May 2009 ; Jun 2009 ; Jul 2009 ; Aug 2009 ; Sep 2009 ; Oct 2009 ; Nov 2009 ; Dec 2009 ; Jan 2010 ; Feb 2010 ; Mar 2010 ; Apr 2010 ; May 2010 ; Jun 2010 ; Jul 2010 ; Aug 2010 ; Sep 2010 ; Oct 2010 ; Nov 2010 ; Dec 2010 ; Jan 2011 ; Feb 2011 ; Mar 2011 ; Apr 2011 ; May 2011 ; Jun 2011 ; Jul 2011 ; Aug 2011 ; Sep 2011 ; Oct 2011 ; Nov 2011 ; Dec 2011 ; Jan 
2012 ; Feb 2012 ; Mar 2012 ; Apr 2012 ; May 2012 ; Jun 2012 ; Jul 2012 ; Aug 2012 ; Sep 2012 ; Oct 2012 ; Nov 2012 ; Dec 2012 ; Jan 2013 ; Feb 2013 ; Mar 2013 ; Apr 2013 ; May 2013 ; Jun 2013 ; Jul 2013 ; Aug 2013 ; Sep 2013 ; Oct 2013 ; Nov 2013 ; Dec 2013 ; Jan 2014 ; Feb 2014 ; Mar 2014 ; Apr 2014 ; May 2014 ; Jun 2014 ; Jul 2014 ; Aug 2014 ; Sep 2014 ; Oct 2014 ; Nov 2014 ; Dec 2014 ; Jan 2015 ; Feb 2015 ; Mar 2015 ; Apr 2015 ; May 2015 ; Jun 2015 ; Jul 2015 ; Aug 2015 ; Sep 2015 ; Oct 2015 ; Nov 2015 ; Dec 2015"
# Rejoin the two string halves; sep=" ; " re-inserts the delimiter between
# them so the combined string splits uniformly on " ; ".
unemCore <- paste(unemCore1, unemCore2, sep=" ; ")
unemIndex <- paste(unemIndex1, unemIndex2, sep=" ; ")
# Split into tokens and coerce to numeric; byrow=FALSE fills column-major, so
# the first 480 values become column 1 (US) and the rest column 2 (MA) -- see
# the str() output below (num [1:480, 1:2]). The coercion warning below is
# expected: some tokens ("NA" markers and/or tokens spanning the string
# literals' line breaks) coerce to NA -- TODO confirm which tokens trigger it.
mtxCore <- matrix(data=as.numeric(strsplit(unemCore, " ; ")[[1]]), ncol=2, byrow=FALSE)
## Warning in matrix(data = as.numeric(strsplit(unemCore, " ; ")[[1]]), ncol =
## 2, : NAs introduced by coercion
colnames(mtxCore) <- c("us", "ma")
# Parse the "Mon YYYY" labels into a yearmon index (as.yearmon is from zoo;
# presumably the package was attached earlier in the document -- verify).
vecIndex <- as.yearmon(strsplit(unemIndex, " ; ")[[1]], "%b %Y")
# Assemble the numeric matrix and the yearmon index into an xts time series.
unemployment <- as.xts(mtxCore, order.by=vecIndex)
str(unemployment)
## An 'xts' object on Jan 1976/Dec 2015 containing:
## Data: num [1:480, 1:2] 7.9 7.7 7.6 7.7 7.4 7.6 NA NA 7.6 7.7 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "us" "ma"
## Indexed by objects of class: [yearmon] TZ:
## xts Attributes:
## NULL
# View a summary of your unemployment data
# summary() also reports NA counts per column (73 for us, 20 for ma here).
summary(unemployment)
## Index us ma
## Min. :1976 Min. : 3.800 Min. : 2.100
## 1st Qu.:1986 1st Qu.: 5.300 1st Qu.: 4.300
## Median :1996 Median : 6.000 Median : 5.500
## Mean :1996 Mean : 6.365 Mean : 5.612
## 3rd Qu.:2006 3rd Qu.: 7.300 3rd Qu.: 6.800
## Max. :2016 Max. :10.800 Max. :11.600
## NA's :73 NA's :20
# Fill the remaining missing values by linear interpolation (zoo::na.approx),
# then draw both series on one set of axes, distinguished by line type.
unemployment <- na.approx(unemployment)
labels <- c("US Unemployment (%)" , "MA Unemployment (%)")
lty <- c(1, 2)
# plot.type = "single" overlays the two columns in a single panel
plot.zoo(unemployment, plot.type = "single", lty = lty)
legend("topright", lty = lty, legend = labels, bg = "white")
# Create a one month lag of US unemployment
# With xts, stats::lag(x, k = 1) shifts values forward so each row carries the
# previous month's observation (see the us.1 column in the output below).
us_monthlag <- stats::lag(unemployment$us, k = 1) # caution that dplyr::lag can mask stats::lag
# Create a one year lag of US unemployment
us_yearlag <- stats::lag(unemployment$us, k = 12) # caution that dplyr::lag can mask stats::lag
# Merge your original data with your new lags
# merge() aligns the series on the shared index; the duplicated "us" column
# names are disambiguated to us.1 (one-month lag) and us.2 (one-year lag).
unemployment_lags <- merge(unemployment, us_monthlag, us_yearlag)
# View the first 15 rows of unemployment_lags
# The leading rows of each lag are NA: there is no earlier data to shift in.
head(unemployment_lags, n=15)
## us ma us.1 us.2
## Jan 1976 7.90 11.60 NA NA
## Feb 1976 7.70 11.25 7.90 NA
## Mar 1976 7.60 10.90 7.70 NA
## Apr 1976 7.70 10.15 7.60 NA
## May 1976 7.40 9.40 7.70 NA
## Jun 1976 7.60 9.80 7.40 NA
## Jul 1976 7.60 9.70 7.60 NA
## Aug 1976 7.60 9.00 7.60 NA
## Sep 1976 7.60 9.00 7.60 NA
## Oct 1976 7.70 8.30 7.60 NA
## Nov 1976 7.80 8.30 7.70 NA
## Dec 1976 7.65 8.20 7.80 NA
## Jan 1977 7.50 9.50 7.65 7.9
## Feb 1977 7.60 9.20 7.50 7.7
## Mar 1977 7.40 8.80 7.60 7.6
# Month-over-month change in US unemployment (lag = 1 and differences = 1 are
# the defaults for diff, so they are omitted here)
unemployment$us_monthlydiff <- diff(unemployment$us)
# Year-over-year change: difference against the observation 12 months back
unemployment$us_yearlydiff <- diff(unemployment$us, lag = 12)
# Show the level series and its annual change stacked in a 2x1 layout,
# restoring the single-panel layout afterwards
par(mfrow = c(2, 1))
plot.xts(unemployment$us)
plot.xts(unemployment$us_yearlydiff, type = "h")
par(mfrow = c(1, 1))
# Interpolate missing GDP observations, then compute the quarter-over-quarter
# change in the gdp column
gdp_xts <- na.approx(gdp_xts)
gdp_xts$quarterly_diff <- diff(gdp_xts$gdp, lag = 1, differences = 1)
# Break the quarterly changes into one chunk per calendar year
gdpchange_years <- split(gdp_xts$quarterly_diff, f = "years")
# Running year-to-date total: accumulate the changes within each year
gdpchange_ytd <- lapply(gdpchange_years, cumsum)
# Stitch the per-year chunks back into a single xts series
gdpchange_xts <- do.call(rbind, gdpchange_ytd)
# Bar-style plot of the cumulative year-to-date change in GDP
plot.xts(gdpchange_xts, type = "h")
# Use rollapply to calculate the rolling yearly average US unemployment
# Windows appear to be trailing (right-aligned) 12-month spans: in the tail()
# output below, Dec 2015 year_avg (5.2833) equals the mean of the 2015 values.
unemployment$year_avg <- rollapply(unemployment$us, width = 12, FUN = mean)
# Plot all columns of US unemployment data
# Dashed thin line for the raw series, solid thick line for the rolling mean
lty <- c(2, 1)
lwd <- c(1, 2)
plot.zoo(unemployment[, c("us", "year_avg")], plot.type = "single", lty = lty, lwd = lwd)
# Add a one-year lag of MA unemployment
unemployment$ma_yearlag <- stats::lag(unemployment$ma, k=12) # caution that dplyr::lag can mask stats::lag
# Add a six-month difference of MA unemployment
unemployment$ma_sixmonthdiff <- diff(unemployment$ma, lag=6, differences=1)
# Add a six-month rolling average of MA unemployment
unemployment$ma_sixmonthavg <- rollapply(unemployment$ma, width=6, FUN=mean)
# Add a yearly rolling maximum of MA unemployment
unemployment$ma_yearmax <- rollapply(unemployment$ma, width=12, FUN=max)
# View the last year of unemployment data
tail(unemployment, n=12)
## us ma us_monthlydiff us_yearlydiff year_avg ma_yearlag
## Jan 2015 5.7 5.8 0.1 -0.9 6.091667 6.80
## Feb 2015 5.5 5.5 -0.2 -1.2 5.991667 6.50
## Mar 2015 5.5 5.2 0.0 -1.2 5.891667 6.20
## Apr 2015 5.4 4.7 -0.1 -0.8 5.825000 5.50
## May 2015 5.5 4.9 0.1 -0.7 5.766667 5.50
## Jun 2015 5.3 5.2 -0.2 -0.8 5.700000 6.00
## Jul 2015 5.3 5.2 0.0 -0.9 5.625000 6.00
## Aug 2015 5.1 4.7 -0.2 -1.1 5.533333 5.85
## Sep 2015 5.1 4.9 0.0 -0.9 5.458333 5.70
## Oct 2015 5.0 4.5 -0.1 -0.7 5.400000 5.00
## Nov 2015 5.0 4.5 0.0 -0.8 5.333333 5.00
## Dec 2015 5.0 4.6 0.0 -0.6 5.283333 4.90
## ma_sixmonthdiff ma_sixmonthavg ma_yearmax
## Jan 2015 -0.20 5.375000 6.50
## Feb 2015 -0.35 5.316667 6.20
## Mar 2015 -0.50 5.233333 6.00
## Apr 2015 -0.30 5.183333 6.00
## May 2015 -0.10 5.166667 6.00
## Jun 2015 0.30 5.216667 6.00
## Jul 2015 -0.60 5.116667 5.85
## Aug 2015 -0.80 4.983333 5.80
## Sep 2015 -0.30 4.933333 5.80
## Oct 2015 -0.20 4.900000 5.80
## Nov 2015 -0.40 4.833333 5.80
## Dec 2015 -0.60 4.733333 5.80
Chapter 4 - Sports Data
Advanced features of xts:
Indexing commands in xts:
Example code includes:
# Game dates for the sports example, semicolon-delimited ISO dates, split
# across four strings (2010-2012 and 2013-2015, twice) and re-joined below.
rsDate1 <- "2010-04-04 ; 2010-04-06 ; 2010-04-07 ; 2010-04-16 ; 2010-04-17 ; 2010-04-18 ; 2010-04-19 ; 2010-04-20 ; 2010-04-21 ; 2010-04-22 ; 2010-04-23 ; 2010-04-24 ; 2010-04-25 ; 2010-05-03 ; 2010-05-04 ; 2010-05-05 ; 2010-05-06 ; 2010-05-07 ; 2010-05-08 ; 2010-05-09 ; 2010-05-10 ; 2010-05-11 ; 2010-05-12 ; 2010-05-19 ; 2010-05-20 ; 2010-05-27 ; 2010-05-28 ; 2010-05-29 ; 2010-05-30 ; 2010-06-01 ; 2010-06-02 ; 2010-06-03 ; 2010-06-11 ; 2010-06-12 ; 2010-06-13 ; 2010-06-15 ; 2010-06-16 ; 2010-06-17 ; 2010-06-18 ; 2010-06-19 ; 2010-06-20 ; 2010-06-29 ; 2010-06-30 ; 2010-07-02 ; 2010-07-03 ; 2010-07-04 ; 2010-07-15 ; 2010-07-16 ; 2010-07-17 ; 2010-07-18 ; 2010-07-30 ; 2010-07-31 ; 2010-08-01 ; 2010-08-02 ; 2010-08-03 ; 2010-08-04 ; 2010-08-05 ; 2010-08-17 ; 2010-08-18 ; 2010-08-19 ; 2010-08-20 ; 2010-08-21 ; 2010-08-22 ; 2010-08-23 ; 2010-08-25 ; 2010-08-25 ; 2010-09-04 ; 2010-09-04 ; 2010-09-05 ; 2010-09-06 ; 2010-09-07 ; 2010-09-08 ; 2010-09-17 ; 2010-09-18 ; 2010-09-19 ; 2010-09-20 ; 2010-09-21 ; 2010-09-22 ; 2010-10-02 ; 2010-10-02 ; 2010-10-03 ; 2011-04-08 ; 2011-04-09 ; 2011-04-10 ; 2011-04-11 ; 2011-04-12 ; 2011-04-15 ; 2011-04-16 ; 2011-04-17 ; 2011-04-18 ; 2011-04-29 ; 2011-04-30 ; 2011-05-01 ; 2011-05-02 ; 2011-05-03 ; 2011-05-04 ; 2011-05-05 ; 2011-05-06 ; 2011-05-07 ; 2011-05-08 ; 2011-05-09 ; 2011-05-16 ; 2011-05-18 ; 2011-05-19 ; 2011-05-20 ; 2011-05-21 ; 2011-05-22 ; 2011-05-30 ; 2011-05-31 ; 2011-06-01 ; 2011-06-03 ; 2011-06-04 ; 2011-06-05 ; 2011-06-17 ; 2011-06-18 ; 2011-06-19 ; 2011-06-20 ; 2011-06-21 ; 2011-06-22 ; 2011-07-04 ; 2011-07-05 ; 2011-07-06 ; 2011-07-07 ; 2011-07-08 ; 2011-07-09 ; 2011-07-10 ; 2011-07-22 ; 2011-07-23 ; 2011-07-24 ; 2011-07-25 ; 2011-07-26 ; 2011-07-27 ; 2011-07-28 ; 2011-08-01 ; 2011-08-02 ; 2011-08-03 ; 2011-08-04 ; 2011-08-05 ; 2011-08-06 ; 2011-08-07 ; 2011-08-16 ; 2011-08-16 ; 2011-08-17 ; 2011-08-26 ; 2011-08-27 ; 2011-08-27 ; 2011-08-30 ; 2011-08-31 ; 2011-09-01 ; 2011-09-02 ; 2011-09-03 ; 2011-09-04 ; 2011-09-13 
; 2011-09-14 ; 2011-09-15 ; 2011-09-16 ; 2011-09-17 ; 2011-09-18 ; 2011-09-19 ; 2011-09-19 ; 2011-09-20 ; 2011-09-21 ; 2012-04-13 ; 2012-04-14 ; 2012-04-15 ; 2012-04-16 ; 2012-04-17 ; 2012-04-18 ; 2012-04-20 ; 2012-04-21 ; 2012-04-30 ; 2012-05-01 ; 2012-05-02 ; 2012-05-04 ; 2012-05-05 ; 2012-05-06 ; 2012-05-10 ; 2012-05-11 ; 2012-05-12 ; 2012-05-13 ; 2012-05-14 ; 2012-05-15 ; 2012-05-25 ; 2012-05-26 ; 2012-05-27 ; 2012-05-28 ; 2012-05-29 ; 2012-05-30 ; 2012-05-31 ; 2012-06-05 ; 2012-06-06 ; 2012-06-07 ; 2012-06-08 ; 2012-06-09 ; 2012-06-10 ; 2012-06-19 ; 2012-06-20 ; 2012-06-21 ; 2012-06-22 ; 2012-06-23 ; 2012-06-24 ; 2012-06-25 ; 2012-06-26 ; 2012-06-27 ; 2012-07-06 ; 2012-07-07 ; 2012-07-07 ; 2012-07-08 ; 2012-07-16 ; 2012-07-17 ; 2012-07-18 ; 2012-07-19 ; 2012-07-20 ; 2012-07-21 ; 2012-07-22 ; 2012-07-30 ; 2012-07-31 ; 2012-08-01 ; 2012-08-02 ; 2012-08-03 ; 2012-08-04 ; 2012-08-05 ; 2012-08-06 ; 2012-08-07 ; 2012-08-08 ; 2012-08-21 ; 2012-08-22 ; 2012-08-23 ; 2012-08-24 ; 2012-08-25 ; 2012-08-26 ; 2012-08-27 ; 2012-09-07 ; 2012-09-08 ; 2012-09-09 ; 2012-09-11 ; 2012-09-12 ; 2012-09-13 ; 2012-09-21 ; 2012-09-22 ; 2012-09-23 ; 2012-09-25 ; 2012-09-26"
# 2013-2015 dates, continuing the same schedule layout.
rsDate2 <- "2013-04-08 ; 2013-04-10 ; 2013-04-11 ; 2013-04-13 ; 2013-04-14 ; 2013-04-15 ; 2013-04-20 ; 2013-04-21 ; 2013-04-21 ; 2013-04-22 ; 2013-04-23 ; 2013-04-24 ; 2013-04-25 ; 2013-04-26 ; 2013-04-27 ; 2013-04-28 ; 2013-05-06 ; 2013-05-07 ; 2013-05-08 ; 2013-05-09 ; 2013-05-10 ; 2013-05-11 ; 2013-05-12 ; 2013-05-23 ; 2013-05-24 ; 2013-05-25 ; 2013-05-26 ; 2013-05-27 ; 2013-05-28 ; 2013-06-04 ; 2013-06-05 ; 2013-06-06 ; 2013-06-08 ; 2013-06-08 ; 2013-06-09 ; 2013-06-18 ; 2013-06-18 ; 2013-06-19 ; 2013-06-25 ; 2013-06-26 ; 2013-06-27 ; 2013-06-28 ; 2013-06-29 ; 2013-06-30 ; 2013-07-02 ; 2013-07-03 ; 2013-07-04 ; 2013-07-19 ; 2013-07-20 ; 2013-07-21 ; 2013-07-22 ; 2013-07-23 ; 2013-07-24 ; 2013-07-29 ; 2013-07-30 ; 2013-07-31 ; 2013-08-01 ; 2013-08-02 ; 2013-08-03 ; 2013-08-04 ; 2013-08-16 ; 2013-08-17 ; 2013-08-18 ; 2013-08-27 ; 2013-08-28 ; 2013-08-29 ; 2013-08-30 ; 2013-08-31 ; 2013-09-01 ; 2013-09-02 ; 2013-09-03 ; 2013-09-04 ; 2013-09-13 ; 2013-09-14 ; 2013-09-15 ; 2013-09-17 ; 2013-09-18 ; 2013-09-19 ; 2013-09-20 ; 2013-09-21 ; 2013-09-22 ; 2014-04-04 ; 2014-04-05 ; 2014-04-06 ; 2014-04-07 ; 2014-04-08 ; 2014-04-09 ; 2014-04-18 ; 2014-04-19 ; 2014-04-20 ; 2014-04-21 ; 2014-04-22 ; 2014-04-23 ; 2014-04-24 ; 2014-04-29 ; 2014-05-01 ; 2014-05-01 ; 2014-05-02 ; 2014-05-03 ; 2014-05-04 ; 2014-05-06 ; 2014-05-07 ; 2014-05-16 ; 2014-05-17 ; 2014-05-18 ; 2014-05-20 ; 2014-05-21 ; 2014-05-22 ; 2014-05-28 ; 2014-05-29 ; 2014-05-30 ; 2014-05-31 ; 2014-06-01 ; 2014-06-12 ; 2014-06-13 ; 2014-06-14 ; 2014-06-15 ; 2014-06-16 ; 2014-06-17 ; 2014-06-18 ; 2014-06-30 ; 2014-07-01 ; 2014-07-02 ; 2014-07-05 ; 2014-07-05 ; 2014-07-06 ; 2014-07-07 ; 2014-07-08 ; 2014-07-09 ; 2014-07-10 ; 2014-07-18 ; 2014-07-19 ; 2014-07-20 ; 2014-07-28 ; 2014-07-29 ; 2014-07-30 ; 2014-08-01 ; 2014-08-02 ; 2014-08-03 ; 2014-08-14 ; 2014-08-15 ; 2014-08-16 ; 2014-08-17 ; 2014-08-18 ; 2014-08-19 ; 2014-08-20 ; 2014-08-21 ; 2014-08-22 ; 2014-08-23 ; 2014-08-24 ; 2014-09-05 ; 2014-09-06 ; 2014-09-07 
; 2014-09-08 ; 2014-09-09 ; 2014-09-10 ; 2014-09-23 ; 2014-09-24 ; 2014-09-25 ; 2014-09-26 ; 2014-09-27 ; 2014-09-28 ; 2015-04-13 ; 2015-04-14 ; 2015-04-15 ; 2015-04-17 ; 2015-04-18 ; 2015-04-19 ; 2015-04-20 ; 2015-04-27 ; 2015-04-28 ; 2015-04-29 ; 2015-05-01 ; 2015-05-02 ; 2015-05-03 ; 2015-05-04 ; 2015-05-05 ; 2015-05-06 ; 2015-05-19 ; 2015-05-20 ; 2015-05-21 ; 2015-05-22 ; 2015-05-23 ; 2015-05-24 ; 2015-06-02 ; 2015-06-03 ; 2015-06-03 ; 2015-06-04 ; 2015-06-05 ; 2015-06-06 ; 2015-06-07 ; 2015-06-12 ; 2015-06-13 ; 2015-06-14 ; 2015-06-15 ; 2015-06-16 ; 2015-06-23 ; 2015-06-24 ; 2015-06-25 ; 2015-07-03 ; 2015-07-04 ; 2015-07-05 ; 2015-07-07 ; 2015-07-08 ; 2015-07-10 ; 2015-07-11 ; 2015-07-12 ; 2015-07-24 ; 2015-07-25 ; 2015-07-26 ; 2015-07-27 ; 2015-07-28 ; 2015-07-29 ; 2015-07-30 ; 2015-07-31 ; 2015-08-01 ; 2015-08-02 ; 2015-08-14 ; 2015-08-15 ; 2015-08-16 ; 2015-08-17 ; 2015-08-18 ; 2015-08-19 ; 2015-08-20 ; 2015-08-21 ; 2015-08-22 ; 2015-08-23 ; 2015-08-31 ; 2015-09-01 ; 2015-09-02 ; 2015-09-04 ; 2015-09-05 ; 2015-09-06 ; 2015-09-07 ; 2015-09-08 ; 2015-09-09 ; 2015-09-21 ; 2015-09-22 ; 2015-09-23 ; 2015-09-24 ; 2015-09-25 ; 2015-09-26 ; 2015-09-27"
# Additional 2010-2012 dates (a second set of games).
rsDate3 <- "2010-04-09 ; 2010-04-10 ; 2010-04-11 ; 2010-04-12 ; 2010-04-14 ; 2010-04-15 ; 2010-04-26 ; 2010-04-27 ; 2010-04-28 ; 2010-04-30 ; 2010-05-01 ; 2010-05-02 ; 2010-05-14 ; 2010-05-15 ; 2010-05-16 ; 2010-05-17 ; 2010-05-18 ; 2010-05-21 ; 2010-05-22 ; 2010-05-23 ; 2010-05-24 ; 2010-05-25 ; 2010-05-26 ; 2010-06-04 ; 2010-06-05 ; 2010-06-06 ; 2010-06-07 ; 2010-06-08 ; 2010-06-09 ; 2010-06-10 ; 2010-06-22 ; 2010-06-23 ; 2010-06-24 ; 2010-06-25 ; 2010-06-26 ; 2010-06-27 ; 2010-07-05 ; 2010-07-06 ; 2010-07-07 ; 2010-07-09 ; 2010-07-10 ; 2010-07-11 ; 2010-07-19 ; 2010-07-20 ; 2010-07-21 ; 2010-07-22 ; 2010-07-23 ; 2010-07-24 ; 2010-07-25 ; 2010-07-26 ; 2010-07-27 ; 2010-07-28 ; 2010-08-06 ; 2010-08-07 ; 2010-08-08 ; 2010-08-09 ; 2010-08-10 ; 2010-08-11 ; 2010-08-12 ; 2010-08-13 ; 2010-08-14 ; 2010-08-15 ; 2010-08-27 ; 2010-08-28 ; 2010-08-29 ; 2010-08-31 ; 2010-09-01 ; 2010-09-02 ; 2010-09-10 ; 2010-09-11 ; 2010-09-12 ; 2010-09-13 ; 2010-09-14 ; 2010-09-15 ; 2010-09-24 ; 2010-09-25 ; 2010-09-26 ; 2010-09-27 ; 2010-09-28 ; 2010-09-29 ; 2010-09-30 ; 2011-04-01 ; 2011-04-02 ; 2011-04-03 ; 2011-04-05 ; 2011-04-06 ; 2011-04-07 ; 2011-04-19 ; 2011-04-20 ; 2011-04-21 ; 2011-04-22 ; 2011-04-23 ; 2011-04-24 ; 2011-04-26 ; 2011-04-27 ; 2011-04-28 ; 2011-05-10 ; 2011-05-11 ; 2011-05-13 ; 2011-05-14 ; 2011-05-15 ; 2011-05-23 ; 2011-05-24 ; 2011-05-25 ; 2011-05-26 ; 2011-05-27 ; 2011-05-29 ; 2011-05-29 ; 2011-06-07 ; 2011-06-08 ; 2011-06-09 ; 2011-06-10 ; 2011-06-11 ; 2011-06-12 ; 2011-06-14 ; 2011-06-15 ; 2011-06-16 ; 2011-06-24 ; 2011-06-25 ; 2011-06-26 ; 2011-06-28 ; 2011-06-29 ; 2011-06-30 ; 2011-07-01 ; 2011-07-02 ; 2011-07-03 ; 2011-07-15 ; 2011-07-16 ; 2011-07-17 ; 2011-07-18 ; 2011-07-19 ; 2011-07-20 ; 2011-07-29 ; 2011-07-30 ; 2011-07-31 ; 2011-08-08 ; 2011-08-09 ; 2011-08-10 ; 2011-08-12 ; 2011-08-13 ; 2011-08-14 ; 2011-08-18 ; 2011-08-19 ; 2011-08-20 ; 2011-08-21 ; 2011-08-22 ; 2011-08-23 ; 2011-08-24 ; 2011-08-25 ; 2011-09-05 ; 2011-09-06 ; 2011-09-07 ; 2011-09-08 
; 2011-09-09 ; 2011-09-10 ; 2011-09-11 ; 2011-09-24 ; 2011-09-25 ; 2011-09-25 ; 2011-09-26 ; 2011-09-27 ; 2011-09-28 ; 2012-04-05 ; 2012-04-07 ; 2012-04-08 ; 2012-04-09 ; 2012-04-10 ; 2012-04-11 ; 2012-04-23 ; 2012-04-24 ; 2012-04-25 ; 2012-04-26 ; 2012-04-27 ; 2012-04-28 ; 2012-04-29 ; 2012-05-07 ; 2012-05-08 ; 2012-05-09 ; 2012-05-16 ; 2012-05-17 ; 2012-05-18 ; 2012-05-19 ; 2012-05-20 ; 2012-05-21 ; 2012-05-22 ; 2012-05-23 ; 2012-06-01 ; 2012-06-02 ; 2012-06-03 ; 2012-06-11 ; 2012-06-12 ; 2012-06-13 ; 2012-06-15 ; 2012-06-16 ; 2012-06-17 ; 2012-06-28 ; 2012-06-29 ; 2012-06-30 ; 2012-07-01 ; 2012-07-02 ; 2012-07-03 ; 2012-07-04 ; 2012-07-13 ; 2012-07-14 ; 2012-07-15 ; 2012-07-23 ; 2012-07-24 ; 2012-07-25 ; 2012-07-27 ; 2012-07-28 ; 2012-07-29 ; 2012-08-09 ; 2012-08-10 ; 2012-08-11 ; 2012-08-12 ; 2012-08-14 ; 2012-08-15 ; 2012-08-16 ; 2012-08-17 ; 2012-08-18 ; 2012-08-19 ; 2012-08-28 ; 2012-08-29 ; 2012-08-30 ; 2012-08-31 ; 2012-09-01 ; 2012-09-02 ; 2012-09-03 ; 2012-09-04 ; 2012-09-05 ; 2012-09-14 ; 2012-09-15 ; 2012-09-16 ; 2012-09-17 ; 2012-09-18 ; 2012-09-19 ; 2012-09-20 ; 2012-09-28 ; 2012-09-29 ; 2012-09-30 ; 2012-10-01 ; 2012-10-02 ; 2012-10-03"
# Additional 2013-2015 dates (second set of games).
rsDate4 <- "2013-04-01 ; 2013-04-03 ; 2013-04-04 ; 2013-04-05 ; 2013-04-06 ; 2013-04-07 ; 2013-04-16 ; 2013-04-17 ; 2013-04-18 ; 2013-04-30 ; 2013-05-01 ; 2013-05-02 ; 2013-05-03 ; 2013-05-04 ; 2013-05-05 ; 2013-05-14 ; 2013-05-15 ; 2013-05-16 ; 2013-05-17 ; 2013-05-18 ; 2013-05-19 ; 2013-05-20 ; 2013-05-21 ; 2013-05-22 ; 2013-05-29 ; 2013-05-30 ; 2013-05-31 ; 2013-06-01 ; 2013-06-02 ; 2013-06-10 ; 2013-06-11 ; 2013-06-12 ; 2013-06-13 ; 2013-06-14 ; 2013-06-15 ; 2013-06-16 ; 2013-06-20 ; 2013-06-21 ; 2013-06-22 ; 2013-06-23 ; 2013-07-05 ; 2013-07-06 ; 2013-07-07 ; 2013-07-08 ; 2013-07-09 ; 2013-07-10 ; 2013-07-11 ; 2013-07-12 ; 2013-07-13 ; 2013-07-14 ; 2013-07-26 ; 2013-07-27 ; 2013-07-28 ; 2013-08-05 ; 2013-08-06 ; 2013-08-07 ; 2013-08-08 ; 2013-08-09 ; 2013-08-10 ; 2013-08-11 ; 2013-08-13 ; 2013-08-14 ; 2013-08-15 ; 2013-08-19 ; 2013-08-20 ; 2013-08-21 ; 2013-08-23 ; 2013-08-24 ; 2013-08-25 ; 2013-09-05 ; 2013-09-06 ; 2013-09-07 ; 2013-09-08 ; 2013-09-10 ; 2013-09-11 ; 2013-09-12 ; 2013-09-24 ; 2013-09-25 ; 2013-09-27 ; 2013-09-28 ; 2013-09-29 ; 2014-03-31 ; 2014-04-02 ; 2014-04-03 ; 2014-04-10 ; 2014-04-11 ; 2014-04-12 ; 2014-04-13 ; 2014-04-15 ; 2014-04-16 ; 2014-04-17 ; 2014-04-25 ; 2014-04-26 ; 2014-04-27 ; 2014-05-09 ; 2014-05-10 ; 2014-05-11 ; 2014-05-13 ; 2014-05-14 ; 2014-05-15 ; 2014-05-23 ; 2014-05-24 ; 2014-05-25 ; 2014-05-26 ; 2014-05-27 ; 2014-06-02 ; 2014-06-03 ; 2014-06-04 ; 2014-06-06 ; 2014-06-07 ; 2014-06-08 ; 2014-06-09 ; 2014-06-10 ; 2014-06-11 ; 2014-06-19 ; 2014-06-20 ; 2014-06-21 ; 2014-06-22 ; 2014-06-23 ; 2014-06-24 ; 2014-06-25 ; 2014-06-27 ; 2014-06-28 ; 2014-06-29 ; 2014-07-11 ; 2014-07-12 ; 2014-07-13 ; 2014-07-21 ; 2014-07-22 ; 2014-07-23 ; 2014-07-24 ; 2014-07-25 ; 2014-07-26 ; 2014-07-27 ; 2014-08-05 ; 2014-08-06 ; 2014-08-07 ; 2014-08-08 ; 2014-08-09 ; 2014-08-10 ; 2014-08-12 ; 2014-08-13 ; 2014-08-25 ; 2014-08-26 ; 2014-08-27 ; 2014-08-29 ; 2014-08-30 ; 2014-08-31 ; 2014-09-01 ; 2014-09-02 ; 2014-09-03 ; 2014-09-04 ; 2014-09-11 
; 2014-09-12 ; 2014-09-13 ; 2014-09-14 ; 2014-09-16 ; 2014-09-17 ; 2014-09-18 ; 2014-09-19 ; 2014-09-20 ; 2014-09-21 ; 2015-04-06 ; 2015-04-08 ; 2015-04-09 ; 2015-04-10 ; 2015-04-11 ; 2015-04-12 ; 2015-04-21 ; 2015-04-22 ; 2015-04-23 ; 2015-04-24 ; 2015-04-25 ; 2015-04-26 ; 2015-05-08 ; 2015-05-09 ; 2015-05-10 ; 2015-05-11 ; 2015-05-12 ; 2015-05-13 ; 2015-05-14 ; 2015-05-15 ; 2015-05-16 ; 2015-05-17 ; 2015-05-25 ; 2015-05-26 ; 2015-05-27 ; 2015-05-28 ; 2015-05-29 ; 2015-05-30 ; 2015-05-31 ; 2015-06-09 ; 2015-06-10 ; 2015-06-11 ; 2015-06-17 ; 2015-06-18 ; 2015-06-19 ; 2015-06-20 ; 2015-06-21 ; 2015-06-26 ; 2015-06-27 ; 2015-06-28 ; 2015-06-29 ; 2015-06-30 ; 2015-07-01 ; 2015-07-02 ; 2015-07-17 ; 2015-07-18 ; 2015-07-20 ; 2015-07-20 ; 2015-07-21 ; 2015-07-22 ; 2015-07-23 ; 2015-08-04 ; 2015-08-05 ; 2015-08-06 ; 2015-08-07 ; 2015-08-08 ; 2015-08-09 ; 2015-08-11 ; 2015-08-12 ; 2015-08-24 ; 2015-08-25 ; 2015-08-26 ; 2015-08-28 ; 2015-08-29 ; 2015-08-30 ; 2015-09-11 ; 2015-09-12 ; 2015-09-13 ; 2015-09-14 ; 2015-09-15 ; 2015-09-16 ; 2015-09-18 ; 2015-09-19 ; 2015-09-20 ; 2015-09-28 ; 2015-09-29 ; 2015-09-30 ; 2015-10-01 ; 2015-10-02 ; 2015-10-03 ; 2015-10-04"
# Re-join all four halves into a single delimited string of dates.
rsDate <- paste(rsDate1, rsDate2, rsDate3, rsDate4, sep=" ; ")
# Runs scored by the team in each game, aligned with the rsDate vector above.
rsScore <- "9 ; 4 ; 1 ; 1 ; 5 ; 1 ; 2 ; 7 ; 8 ; 0 ; 4 ; 7 ; 6 ; 17 ; 5 ; 3 ; 11 ; 3 ; 3 ; 9 ; 7 ; 6 ; 2 ; 3 ; 6 ; 3 ; 5 ; 1 ; 8 ; 9 ; 6 ; 8 ; 12 ; 10 ; 3 ; 6 ; 6 ; 8 ; 10 ; 5 ; 2 ; 8 ; 4 ; 3 ; 9 ; 1 ; 2 ; 4 ; 3 ; 2 ; 5 ; 5 ; 4 ; 5 ; 3 ; 1 ; 6 ; 6 ; 7 ; 2 ; 2 ; 5 ; 5 ; 6 ; 5 ; 2 ; 1 ; 1 ; 5 ; 12 ; 5 ; 11 ; 9 ; 3 ; 6 ; 2 ; 1 ; 6 ; 5 ; 7 ; 8 ; 9 ; 4 ; 4 ; 5 ; 2 ; 6 ; 4 ; 8 ; 9 ; 4 ; 0 ; 3 ; 9 ; 7 ; 3 ; 0 ; 2 ; 4 ; 9 ; 2 ; 8 ; 1 ; 4 ; 15 ; 3 ; 5 ; 3 ; 7 ; 4 ; 8 ; 9 ; 6 ; 10 ; 2 ; 12 ; 14 ; 4 ; 1 ; 7 ; 3 ; 6 ; 10 ; 10 ; 4 ; 8 ; 7 ; 3 ; 12 ; 1 ; 13 ; 12 ; 3 ; 6 ; 3 ; 4 ; 3 ; 2 ; 10 ; 3 ; 3 ; 2 ; 0 ; 5 ; 9 ; 4 ; 2 ; 9 ; 2 ; 0 ; 12 ; 4 ; 18 ; 4 ; 2 ; 4 ; 3 ; 5 ; 5 ; 18 ; 5 ; 4 ; 12 ; 13 ; 6 ; 0 ; 3 ; 3 ; 2 ; 9 ; 11 ; 3 ; 2 ; 4 ; 2 ; 6 ; 3 ; 7 ; 4 ; 12 ; 6 ; 5 ; 4 ; 3 ; 3 ; 7 ; 6 ; 6 ; 3 ; 6 ; 1 ; 7 ; 4 ; 2 ; 3 ; 7 ; 15 ; 6 ; 1 ; 8 ; 9 ; 6 ; 5 ; 10 ; 8 ; 1 ; 9 ; 3 ; 5 ; 5 ; 10 ; 3 ; 1 ; 3 ; 7 ; 7 ; 4 ; 5 ; 0 ; 5 ; 4 ; 6 ; 9 ; 3 ; 9 ; 3 ; 3 ; 13 ; 4 ; 9 ; 8 ; 5 ; 5 ; 2 ; 3 ; 4 ; 4 ; 0 ; 2 ; 6 ; 2 ; 2 ; 2 ; 3 ; 5 ; 2 ; 2 ; 5 ; 3 ; 4 ; 2 ; 4 ; 9 ; 0 ; 6 ; 7 ; 7 ; 8 ; 6 ; 6 ; 1 ; 8 ; 3 ; 5 ; 2 ; 4 ; 3 ; 8 ; 7 ; 6 ; 9 ; 1 ; 17 ; 2 ; 6 ; 5 ; 7 ; 10 ; 5 ; 3 ; 2 ; 11 ; 5 ; 7 ; 7 ; 2 ; 5 ; 4 ; 2 ; 8 ; 4 ; 2 ; 8 ; 0 ; 6 ; 1 ; 1 ; 8 ; 5 ; 8 ; 6 ; 5 ; 4 ; 3 ; 6 ; 6 ; 13 ; 4 ; 2 ; 4 ; 7 ; 7 ; 0 ; 2 ; 20 ; 8 ; 5 ; 9 ; 2 ; 3 ; 3 ; 6 ; 2 ; 5 ; 2 ; 6 ; 0 ; 5 ; 7 ; 4 ; 4 ; 4 ; 6 ; 6 ; 3 ; 5 ; 5 ; 7 ; 1 ; 5 ; 7 ; 6 ; 2 ; 4 ; 4 ; 0 ; 1 ; 2 ; 4 ; 4 ; 2 ; 4 ; 4 ; 3 ; 7 ; 4 ; 5 ; 10 ; 2 ; 2 ; 1 ; 2 ; 2 ; 0 ; 1 ; 9 ; 3 ; 4 ; 6 ; 0 ; 3 ; 5 ; 4 ; 5 ; 2 ; 6 ; 1 ; 2 ; 1 ; 4 ; 4 ; 7 ; 9 ; 3 ; 10 ; 1 ; 2 ; 3 ; 3 ; 0 ; 3 ; 3 ; 6 ; 9 ; 4 ; 1 ; 0 ; 1 ; 6 ; 2 ; 11 ; 11 ; 2 ; 10 ; 5 ; 9 ; 8 ; 5 ; 3 ; 1 ; 3 ; 7 ; 6 ; 8 ; 4 ; 2 ; 2 ; 5 ; 1 ; 2 ; 3 ; 4 ; 1 ; 1 ; 5 ; 8 ; 6 ; 1 ; 6 ; 0 ; 4 ; 4 ; 4 ; 7 ; 10 ; 4 ; 5 ; 2 ; 9 ; 4 ; 5 ; 6 ; 8 ; 6 ; 5 ; 4 ; 6 ; 1 ; 5 ; 6 ; 2 ; 1 ; 11 ; 8 ; 4 ; 2 ; 8 ; 7 ; 11 ; 3 ; 15 ; 22 ; 8 ; 2 ; 9 ; 6 ; 4 ; 7 ; 3 ; 6 ; 4 ; 1 ; 8 ; 7 ; 9 ; 6 ; 11 ; 1 ; 10 ; 8 ; 2 ; 2 ; 2 ; 7 ; 8 ; 2 
; 3 ; 8 ; 8 ; 2 ; 6 ; 0 ; 13 ; 2 ; 2 ; 4 ; 9 ; 2 ; 7 ; 6 ; 1 ; 9 ; 7 ; 1 ; 5 ; 8 ; 6 ; 2 ; 11 ; 11 ; 8 ; 3 ; 4 ; 3 ; 0 ; 7 ; 1 ; 6 ; 13 ; 4 ; 4 ; 5 ; 5 ; 2 ; 4 ; 14 ; 5 ; 3 ; 2 ; 4 ; 4 ; 8 ; 2 ; 1 ; 2 ; 6 ; 4 ; 7 ; 6 ; 2 ; 2 ; 2 ; 7 ; 10 ; 5 ; 9 ; 3 ; 3 ; 3 ; 2 ; 3 ; 2 ; 9 ; 6 ; 0 ; 3 ; 5 ; 5 ; 9 ; 5 ; 10 ; 7 ; 3 ; 6 ; 4 ; 2 ; 2 ; 5 ; 5 ; 1 ; 1 ; 4 ; 0 ; 0 ; 5 ; 4 ; 4 ; 5 ; 7 ; 1 ; 4 ; 6 ; 6 ; 3 ; 5 ; 6 ; 7 ; 2 ; 4 ; 14 ; 14 ; 6 ; 4 ; 0 ; 6 ; 11 ; 8 ; 5 ; 16 ; 14 ; 0 ; 3 ; 4 ; 1 ; 4 ; 4 ; 0 ; 1 ; 5 ; 7 ; 10 ; 2 ; 6 ; 9 ; 1 ; 15 ; 2 ; 4 ; 1 ; 10 ; 5 ; 8 ; 4 ; 2 ; 6 ; 4 ; 3 ; 4 ; 7 ; 4 ; 6 ; 0 ; 11 ; 13 ; 6 ; 0 ; 14 ; 10 ; 4 ; 2 ; 5 ; 1 ; 1 ; 2 ; 7 ; 3 ; 8 ; 3 ; 2 ; 0 ; 12 ; 4 ; 3 ; 1 ; 6 ; 11 ; 7 ; 10 ; 10 ; 1 ; 1 ; 11 ; 4 ; 3 ; 1 ; 5 ; 4 ; 7 ; 5 ; 8 ; 1 ; 6 ; 7 ; 7 ; 1 ; 1 ; 2 ; 10 ; 0 ; 4 ; 7 ; 0 ; 5 ; 2 ; 2 ; 1 ; 2 ; 2 ; 3 ; 3 ; 7 ; 1 ; 2 ; 3 ; 3 ; 8 ; 3 ; 3 ; 3 ; 2 ; 14 ; 1 ; 3 ; 6 ; 4 ; 4 ; 1 ; 5 ; 3 ; 2 ; 2 ; 1 ; 2 ; 1 ; 4 ; 1 ; 8 ; 3 ; 0 ; 5 ; 7 ; 3 ; 4 ; 1 ; 3 ; 3 ; 2 ; 3 ; 2 ; 8 ; 7 ; 2 ; 6 ; 0 ; 13 ; 7 ; 6 ; 6 ; 7 ; 10 ; 3 ; 0 ; 1 ; 3 ; 3 ; 9 ; 4 ; 3 ; 12 ; 5 ; 4 ; 1 ; 6 ; 3 ; 9 ; 1 ; 11 ; 3 ; 10 ; 3 ; 2 ; 4 ; 0 ; 5 ; 3 ; 3 ; 10 ; 3 ; 5 ; 6 ; 7 ; 0 ; 4 ; 11 ; 11 ; 8 ; 4 ; 0 ; 2 ; 0 ; 7 ; 5 ; 0 ; 15 ; 7 ; 1 ; 6 ; 5 ; 3 ; 4 ; 3 ; 1 ; 7 ; 2 ; 12 ; 0 ; 4 ; 8 ; 9 ; 12 ; 13 ; 3 ; 2 ; 7 ; 3 ; 3 ; 15 ; 12 ; 5 ; 6 ; 1 ; 6 ; 4 ; 1 ; 4 ; 4 ; 2 ; 1 ; 6 ; 3 ; 8 ; 7 ; 1 ; 0 ; 8 ; 5 ; 6 ; 9 ; 3 ; 0 ; 5 ; 5 ; 8 ; 6 ; 2 ; 3 ; 4 ; 2 ; 6 ; 5 ; 0 ; 1 ; 0 ; 2 ; 3 ; 1 ; 7 ; 3 ; 2 ; 5 ; 0 ; 2 ; 8 ; 8 ; 2 ; 11 ; 14 ; 3 ; 4 ; 0 ; 4 ; 0 ; 3 ; 2 ; 2 ; 2 ; 4 ; 4 ; 3 ; 3 ; 5 ; 4 ; 11 ; 2 ; 8 ; 0 ; 3 ; 3 ; 9 ; 1 ; 4 ; 6 ; 4 ; 1 ; 8 ; 0 ; 1 ; 2 ; 5 ; 2 ; 3 ; 8 ; 2 ; 6 ; 6 ; 8 ; 4 ; 1 ; 5 ; 1 ; 7 ; 4 ; 7 ; 0 ; 1 ; 6 ; 5 ; 2 ; 2 ; 2 ; 1 ; 4 ; 0 ; 2 ; 1 ; 4 ; 5 ; 4 ; 0 ; 3 ; 0 ; 2 ; 5 ; 2 ; 5 ; 7 ; 4 ; 13 ; 4 ; 1 ; 5 ; 3 ; 4 ; 2 ; 12 ; 0 ; 0 ; 1 ; 3 ; 3 ; 2 ; 4 ; 3 ; 2 ; 1 ; 7 ; 6 ; 7 ; 4 ; 6 ; 5 ; 4 ; 3 ; 6 ; 3 ; 4 ; 4 ; 10 ; 2 ; 0 ; 5 ; 10 ; 1 ; 7 ; 4 ; 5 ; 10 ; 9 ; 1 ; 2 ; 0 ; 1"
# Runs scored by the opponent in each game, aligned with rsScore/rsDate above.
oppScore <- "7 ; 6 ; 3 ; 3 ; 6 ; 7 ; 8 ; 6 ; 7 ; 3 ; 3 ; 6 ; 7 ; 8 ; 1 ; 1 ; 6 ; 10 ; 14 ; 3 ; 6 ; 1 ; 3 ; 2 ; 2 ; 4 ; 12 ; 0 ; 1 ; 4 ; 4 ; 9 ; 2 ; 2 ; 5 ; 3 ; 2 ; 5 ; 6 ; 4 ; 0 ; 5 ; 9 ; 2 ; 3 ; 6 ; 7 ; 8 ; 2 ; 4 ; 6 ; 4 ; 3 ; 6 ; 1 ; 9 ; 2 ; 0 ; 5 ; 7 ; 16 ; 4 ; 0 ; 3 ; 3 ; 4 ; 3 ; 3 ; 7 ; 5 ; 14 ; 5 ; 11 ; 4 ; 0 ; 4 ; 9 ; 1 ; 6 ; 6 ; 4 ; 6 ; 9 ; 0 ; 16 ; 3 ; 7 ; 1 ; 1 ; 1 ; 5 ; 2 ; 2 ; 5 ; 3 ; 5 ; 11 ; 9 ; 0 ; 5 ; 1 ; 7 ; 0 ; 3 ; 5 ; 9 ; 1 ; 7 ; 10 ; 7 ; 6 ; 8 ; 3 ; 4 ; 4 ; 3 ; 5 ; 5 ; 5 ; 9 ; 2 ; 4 ; 4 ; 3 ; 0 ; 6 ; 4 ; 1 ; 8 ; 3 ; 9 ; 5 ; 4 ; 9 ; 2 ; 3 ; 7 ; 3 ; 4 ; 2 ; 1 ; 6 ; 4 ; 15 ; 3 ; 0 ; 5 ; 5 ; 4 ; 10 ; 7 ; 11 ; 6 ; 5 ; 9 ; 3 ; 4 ; 8 ; 6 ; 9 ; 7 ; 6 ; 2 ; 5 ; 4 ; 1 ; 18 ; 6 ; 6 ; 15 ; 6 ; 5 ; 4 ; 6 ; 8 ; 9 ; 8 ; 5 ; 1 ; 1 ; 1 ; 0 ; 7 ; 2 ; 4 ; 4 ; 3 ; 4 ; 7 ; 8 ; 2 ; 0 ; 7 ; 4 ; 4 ; 5 ; 5 ; 5 ; 4 ; 4 ; 4 ; 9 ; 1 ; 4 ; 10 ; 6 ; 5 ; 7 ; 1 ; 7 ; 1 ; 1 ; 6 ; 7 ; 15 ; 3 ; 1 ; 7 ; 5 ; 6 ; 6 ; 4 ; 2 ; 6 ; 10 ; 5 ; 7 ; 14 ; 3 ; 10 ; 6 ; 1 ; 7 ; 9 ; 4 ; 3 ; 5 ; 2 ; 4 ; 9 ; 1 ; 5 ; 4 ; 1 ; 8 ; 3 ; 1 ; 0 ; 2 ; 3 ; 4 ; 5 ; 6 ; 13 ; 5 ; 2 ; 3 ; 4 ; 1 ; 5 ; 6 ; 15 ; 5 ; 0 ; 3 ; 12 ; 12 ; 1 ; 4 ; 5 ; 3 ; 3 ; 5 ; 3 ; 3 ; 9 ; 2 ; 5 ; 1 ; 1 ; 6 ; 4 ; 3 ; 4 ; 5 ; 6 ; 4 ; 1 ; 1 ; 2 ; 2 ; 5 ; 7 ; 3 ; 2 ; 5 ; 2 ; 2 ; 4 ; 7 ; 7 ; 2 ; 0 ; 10 ; 1 ; 9 ; 2 ; 3 ; 3 ; 3 ; 2 ; 6 ; 3 ; 1 ; 4 ; 4 ; 1 ; 2 ; 3 ; 5 ; 1 ; 3 ; 4 ; 2 ; 6 ; 7 ; 4 ; 1 ; 10 ; 2 ; 8 ; 2 ; 5 ; 7 ; 9 ; 1 ; 14 ; 4 ; 2 ; 6 ; 1 ; 3 ; 3 ; 3 ; 3 ; 1 ; 6 ; 6 ; 7 ; 6 ; 7 ; 0 ; 3 ; 2 ; 1 ; 0 ; 2 ; 3 ; 3 ; 3 ; 0 ; 1 ; 1 ; 2 ; 2 ; 16 ; 2 ; 7 ; 7 ; 4 ; 8 ; 4 ; 3 ; 4 ; 1 ; 0 ; 14 ; 4 ; 6 ; 3 ; 6 ; 8 ; 4 ; 5 ; 7 ; 8 ; 4 ; 4 ; 8 ; 2 ; 5 ; 7 ; 8 ; 8 ; 3 ; 3 ; 4 ; 4 ; 10 ; 6 ; 3 ; 1 ; 3 ; 4 ; 9 ; 4 ; 7 ; 10 ; 2 ; 4 ; 8 ; 1 ; 5 ; 11 ; 1 ; 3 ; 4 ; 8 ; 5 ; 0 ; 5 ; 3 ; 2 ; 3 ; 12 ; 3 ; 1 ; 0 ; 3 ; 2 ; 8 ; 2 ; 2 ; 4 ; 13 ; 5 ; 13 ; 4 ; 4 ; 6 ; 1 ; 8 ; 12 ; 1 ; 4 ; 3 ; 3 ; 5 ; 3 ; 8 ; 1 ; 5 ; 1 ; 10 ; 9 ; 9 ; 2 ; 5 ; 7 ; 4 ; 1 ; 10 ; 10 ; 8 ; 1 ; 4 ; 1 ; 2 ; 6 ; 8 ; 3 ; 3 ; 13 ; 5 ; 2 ; 2 ; 4 ; 5 ; 4 ; 7 ; 5 ; 6 ; 4 ; 0 ; 0 ; 0 ; 4 ; 
3 ; 6 ; 5 ; 3 ; 8 ; 12 ; 1 ; 0 ; 5 ; 12 ; 3 ; 2 ; 7 ; 5 ; 11 ; 6 ; 5 ; 0 ; 3 ; 1 ; 0 ; 3 ; 0 ; 2 ; 4 ; 1 ; 2 ; 11 ; 8 ; 2 ; 8 ; 11 ; 5 ; 2 ; 1 ; 6 ; 3 ; 6 ; 3 ; 9 ; 2 ; 1 ; 5 ; 6 ; 6 ; 1 ; 5 ; 4 ; 3 ; 2 ; 3 ; 3 ; 5 ; 7 ; 1 ; 5 ; 1 ; 6 ; 10 ; 1 ; 7 ; 1 ; 3 ; 5 ; 5 ; 6 ; 4 ; 5 ; 4 ; 3 ; 1 ; 6 ; 1 ; 8 ; 3 ; 4 ; 1 ; 5 ; 5 ; 8 ; 9 ; 12 ; 5 ; 3 ; 8 ; 1 ; 5 ; 3 ; 2 ; 3 ; 0 ; 0 ; 4 ; 5 ; 2 ; 7 ; 9 ; 4 ; 0 ; 5 ; 3 ; 2 ; 2 ; 1 ; 3 ; 3 ; 3 ; 4 ; 6 ; 3 ; 1 ; 4 ; 1 ; 4 ; 0 ; 2 ; 3 ; 6 ; 2 ; 5 ; 2 ; 2 ; 5 ; 4 ; 1 ; 9 ; 5 ; 0 ; 10 ; 6 ; 0 ; 3 ; 2 ; 3 ; 6 ; 3 ; 5 ; 4 ; 5 ; 5 ; 3 ; 1 ; 9 ; 1 ; 4 ; 5 ; 2 ; 0 ; 1 ; 0 ; 11 ; 7 ; 7 ; 6 ; 9 ; 9 ; 6 ; 4 ; 6 ; 7 ; 4 ; 3 ; 10 ; 13 ; 2 ; 7 ; 3 ; 5 ; 2 ; 6 ; 3 ; 3 ; 0 ; 4 ; 5 ; 6 ; 4 ; 2 ; 3 ; 6 ; 5 ; 1 ; 6 ; 4 ; 5 ; 2 ; 4 ; 5 ; 4 ; 1 ; 2 ; 3 ; 3 ; 4 ; 1 ; 0 ; 3 ; 1 ; 6 ; 3 ; 3 ; 1 ; 5 ; 3 ; 9 ; 1 ; 5 ; 10 ; 6 ; 2 ; 5 ; 2 ; 5 ; 1 ; 7 ; 5 ; 3 ; 6 ; 1 ; 4 ; 6 ; 10 ; 5 ; 20 ; 7 ; 6 ; 4 ; 3 ; 2 ; 5 ; 2 ; 5 ; 2 ; 5 ; 13 ; 7 ; 9 ; 4 ; 6 ; 10 ; 4 ; 14 ; 2 ; 4 ; 4 ; 4 ; 5 ; 0 ; 2 ; 3 ; 3 ; 9 ; 1 ; 1 ; 7 ; 5 ; 4 ; 5 ; 2 ; 3 ; 2 ; 5 ; 1 ; 6 ; 3 ; 2 ; 4 ; 2 ; 4 ; 1 ; 0 ; 8 ; 8 ; 1 ; 5 ; 2 ; 4 ; 6 ; 4 ; 6 ; 10 ; 7 ; 2 ; 9 ; 3 ; 11 ; 8 ; 4 ; 7 ; 2 ; 3 ; 3 ; 6 ; 3 ; 0 ; 2 ; 10 ; 5 ; 5 ; 9 ; 3 ; 4 ; 2 ; 4 ; 2 ; 0 ; 3 ; 1 ; 2 ; 2 ; 1 ; 8 ; 8 ; 9 ; 4 ; 0 ; 3 ; 4 ; 8 ; 5 ; 3 ; 6 ; 7 ; 2 ; 2 ; 3 ; 4 ; 2 ; 7 ; 3 ; 2 ; 4 ; 1 ; 1 ; 6 ; 7 ; 8 ; 3 ; 2 ; 8 ; 4 ; 4 ; 1 ; 6 ; 8 ; 6 ; 3 ; 3 ; 5 ; 7 ; 6 ; 8 ; 3 ; 4 ; 0 ; 6 ; 4 ; 4 ; 2 ; 6 ; 12 ; 8 ; 4 ; 6 ; 1 ; 5 ; 3 ; 3 ; 0 ; 1 ; 7 ; 6 ; 8 ; 6 ; 3 ; 2 ; 3 ; 1 ; 5 ; 2 ; 5 ; 1 ; 2 ; 4 ; 3 ; 7 ; 5 ; 4 ; 7 ; 0 ; 4 ; 4 ; 5 ; 5 ; 3 ; 2 ; 7 ; 4 ; 4 ; 9 ; 3 ; 3 ; 7 ; 2 ; 0 ; 4 ; 2 ; 5 ; 4 ; 14 ; 0 ; 7 ; 2 ; 5 ; 5 ; 18 ; 7 ; 7 ; 3 ; 4 ; 9 ; 0 ; 1 ; 2 ; 2 ; 5 ; 7 ; 2 ; 6 ; 1 ; 7 ; 8 ; 4 ; 1 ; 5 ; 6 ; 5 ; 2 ; 3 ; 7 ; 2 ; 3 ; 4 ; 3 ; 1 ; 3 ; 11 ; 6 ; 1 ; 3 ; 11 ; 7 ; 8 ; 4 ; 5 ; 13 ; 1 ; 2 ; 2 ; 7 ; 2 ; 5 ; 14 ; 4 ; 5 ; 0 ; 4 ; 1 ; 5 ; 8 ; 4 ; 0 ; 2 ; 6 ; 1 ; 6 ; 6 ; 3 ; 1 ; 4 ; 5 ; 4 ; 8 ; 2 ; 3"
rsHome <- "1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 
0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0"
rsSeas1 <- "2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 
2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015"
rsSeas2 <- "2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 
2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015"
# Stitch the two (apparently identical halves of the) season-label strings back
# into one " ; "-delimited string so it lines up with the 972 games below.
rsSeas <- paste(rsSeas1, rsSeas2, sep=" ; ")
# Build the Red Sox game-log data frame by splitting the " ; "-delimited data
# strings. rsDate and rsScore are defined earlier in this document (outside this
# excerpt). mlb/nfl/nhl/nba are constant league-indicator columns (MLB-only data).
redsox <- data.frame(date=strsplit(rsDate, " ; ")[[1]],
boston_score=as.integer(strsplit(rsScore, " ; ")[[1]]),
opponent_score=as.integer(strsplit(oppScore, " ; ")[[1]]),
homegame=as.numeric(strsplit(rsHome, " ; ")[[1]]),
mlb=1, nfl=0, nhl=0, nba=0,
season=as.numeric(strsplit(rsSeas, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# View summary information about your redsox data
summary(redsox)
## date boston_score opponent_score homegame
## Length:972 Min. : 0.000 Min. : 0.000 Min. :0.0
## Class :character 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:0.0
## Mode :character Median : 4.000 Median : 4.000 Median :0.5
## Mean : 4.796 Mean : 4.538 Mean :0.5
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:1.0
## Max. :22.000 Max. :20.000 Max. :1.0
## mlb nfl nhl nba season
## Min. :1 Min. :0 Min. :0 Min. :0 Min. :2010
## 1st Qu.:1 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:2011
## Median :1 Median :0 Median :0 Median :0 Median :2012
## Mean :1 Mean :0 Mean :0 Mean :0 Mean :2012
## 3rd Qu.:1 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:2014
## Max. :1 Max. :0 Max. :0 Max. :0 Max. :2015
# Convert the date column to a time-based format
redsox$date<- as.Date(redsox$date)
# Convert your red sox data to xts: drop the character date column (col 1) and
# use it as the time index. as.xts()/plot.zoo()/endpoints() etc. come from the
# xts/zoo packages (assumed loaded earlier in the document).
redsox_xts <- as.xts(redsox[,-1], order.by = redsox$date)
# Plot the Red Sox score and the opponent score over time
plot.zoo(redsox_xts[, c("boston_score", "opponent_score")])
# Generate a new variable coding for red sox wins (1 if Boston scored more, else 0)
redsox_xts$win_loss <- ifelse(redsox_xts$boston_score > redsox_xts$opponent_score, 1, 0)
# Identify the date of the last game each season (yearly endpoints of the index)
close <- endpoints(redsox_xts, on = "years")
# Calculate average win/loss record at the end of each season
period.apply(redsox_xts[, "win_loss"], INDEX=close, FUN=mean)
## win_loss
## 2010-10-03 0.5493827
## 2011-09-28 0.5555556
## 2012-10-03 0.4259259
## 2013-09-29 0.5987654
## 2014-09-28 0.4382716
## 2015-10-04 0.4814815
# Split redsox_xts win_loss data into years
redsox_seasons <- split(redsox_xts$win_loss, f = "years")
# Use lapply to calculate the cumulative mean for each season
# NOTE(review): cummean() is presumably dplyr's cumulative mean — confirm the
# package is loaded earlier in the document.
redsox_ytd <- lapply(redsox_seasons, cummean)
# Use do.call to rbind the results
redsox_winloss <- do.call(rbind, redsox_ytd)
# Plot the win_loss average for the 2013 season
# NOTE(review): t()/as.vector() flattens the rbind-ed result before re-attaching
# the full game index — assumes rows come back in original game order; verify.
plot.xts( as.xts(as.vector(t(redsox_winloss)), order.by=index(redsox_xts))["2013"], ylim = c(0, 1))
# Select only the 2013 season
redsox_2013 <- redsox_xts["2013"]
# Use rollapply to generate the last ten average (rolling 10-game mean)
lastten_2013 <- rollapply(redsox_2013$win_loss, width = 10, FUN = mean)
# Plot the last ten average during the 2013 season
plot.xts(lastten_2013, ylim = c(0, 1))
### *** dataset "sports" is not available; need to comment out
# Extract the day of the week of each observation
# weekday <- .indexwday(sports)
# head(weekday)
# Generate an index of weekend dates
# weekend <- which(.indexwday(sports) == 0 | .indexwday(sports) == 6)
# Subset only weekend games
# weekend_games <- sports[weekend]
# head(weekend_games)
# Generate a subset of sports data with only homegames
# homegames <- sports[sports$homegame == 1]
# Calculate the win/loss average of the last 20 home games
# homegames$win_loss_20 <- rollapply(homegames$win_loss, width = 20, FUN = mean)
# Calculate the win/loss average of the last 100 home games
# homegames$win_loss_100 <- rollapply(homegames$win_loss, width = 100, FUN = mean)
# Use plot.xts to generate
# plot.zoo(homegames[, c("win_loss_20", "win_loss_100")], plot.type = "single", lty = lty, lwd = lwd)
Chapter 1 - Exploring pitch velocities
Zack Greinke 2015 season - the dominant month of July 2015:
Subsets and histograms - the start_speed is the MPH numeric for the pitch when it leaves the pitcher’s hand:
Using tapply() for comparisons:
Example code includes:
# Problem - I do not have the greinke dataset!
# Print the first 6 rows of the data
# head(greinke)
# Print the number of rows in the data frame
# nrow(greinke)
# Summarize the start_speed variable
# summary(greinke$start_speed)
# Get rid of data without start_speed
# greinke <- subset(greinke, !is.na(start_speed))
# Print the number of complete entries
# nrow(greinke)
# Print the structure of greinke
# str(greinke)
# Check if dates are formatted as dates
# class(greinke$game_date)
# Change them to dates
# greinke$game_date <- as.Date(greinke$game_date, format="%m/%d/%Y")
# Check that the variable is now formatted as a date
# class(greinke$game_date)
# Separate game_date into "year", "month", and "day"
# greinke <- separate(data = greinke, col = game_date,
# into = c("year", "month", "day"),
# sep = "-", remove = FALSE)
# Convert month to numeric
# greinke$month <- as.numeric(greinke$month)
# Create the july variable
# greinke$july <- ifelse(greinke$month == 7, "july", "other")
# View the head() of greinke
# head(greinke)
# Print a summary of the july variable
# summary(factor(greinke$july))
# Make a histogram of Greinke's start speed
# hist(greinke$start_speed)
# Create greinke_july
# greinke_july <- subset(greinke, july == "july")
# Create greinke_other
# greinke_other <- subset(greinke, july == "other")
# Use par to format your plot layout
# par(mfrow = c(1, 2))
# Plot start_speed histogram from july
# hist(greinke_july$start_speed)
# Plot start_speed histogram for other months
# hist(greinke_other$start_speed)
# Create july_ff
# july_ff <- subset(greinke_july, pitch_type == "FF")
# Create other_ff
# other_ff <- subset(greinke_other, pitch_type == "FF")
# Formatting code, don't change this
# par(mfrow = c(1, 2))
# Plot histogram of July fastball speeds
# hist(july_ff$start_speed)
# Plot histogram of other month fastball speeds
# hist(other_ff$start_speed)
# Make a fastball speed histogram for other months
# hist(other_ff$start_speed,
# col = "#00009950", freq = FALSE,
# ylim = c(0, .35), xlab = "Velocity (mph)",
# main = "Greinke 4-Seam Fastball Velocity")
# Add a histogram for July
# hist(july_ff$start_speed,
# col = "#99000050", freq = FALSE,
# add=TRUE)
# Draw vertical line at the mean of other_ff
# abline(v=mean(other_ff$start_speed), col="#00009950", lwd=2)
# Draw vertical line at the mean of july_ff
# abline(v=mean(july_ff$start_speed), col="#99000050", lwd=2)
# Summarize velocity in July and other months
# tapply(greinke$start_speed, greinke$july, FUN=mean)
# Create greinke_ff
# greinke_ff <- subset(greinke, pitch_type == "FF")
# Calculate mean fastball velocities: ff_velo_month
# ff_velo_month <- tapply(greinke_ff$start_speed, greinke_ff$july, FUN=mean)
# Print ff_velo_month
# ff_velo_month
# Create ff_dt
# ff_dt <- data.frame(tapply(greinke_ff$start_speed, greinke_ff$game_date, FUN=mean))
# Print the first 6 rows of ff_dt
# head(ff_dt)
# Create game_date in ff_dt
# ff_dt$game_date <- as.Date(row.names(ff_dt), format="%Y-%m-%d")
# Rename the first column
# colnames(ff_dt)[1] <- "start_speed"
# Remove row names
# row.names(ff_dt) <- NULL
# View head of ff_dt
# head(ff_dt)
# Plot game-by-game 4-seam fastballs
# plot(ff_dt$start_speed ~ ff_dt$game_date,
# lwd = 4, type = "l", ylim = c(88, 95),
# main = "Greinke 4-Seam Fastball Velocity",
# xlab = "Date", ylab = "Velocity (mph)"
# )
# Code from last exercise, don't change this
# plot(ff_dt$start_speed ~ ff_dt$game_date,
# lwd = 4, type = "l", ylim = c(88, 95),
# main = "Greinke 4-Seam Fastball Velocity",
# xlab = "Date", ylab = "Velocity (mph)")
# Add jittered points to the plot
# points(greinke_ff$start_speed ~ jitter(as.numeric(greinke_ff$game_date)),
# pch=16, col = "#99004450"
# )
# Placeholder output: the pitch-level 'greinke' dataset is not available
# locally, so the exercises above are retained as comments only.
cat("\n\nCould not run - do not have dataset 'greinke' or anything that would serve as an analog\n\n")
##
##
## Could not run - do not have dataset 'greinke' or anything that would serve as an analog
Chapter 2 - Exploring pitch types
Pitch mix - did the pitch mix change in July:
Ball-strike count and pitch usage:
Example code includes:
# DO NOT HAVE THE FULL DATA
# Subset the data to remove pitch types "IN" and "EP"
# greinke <- subset(greinke, pitch_type != "IN" & pitch_type != "EP")
# Drop the levels from pitch_type
# droplevels(greinke$pitch_type)
# Create type_tab
# type_tab <- table(greinke$pitch_type, greinke$july)
# Print type_tab
# type_tab
# Rebuild type_tab from the published frequencies: the first five entries of
# myFreq are the July pitch-type counts, the last five the other-month counts.
myFreq <- c(112, 51, 207, 66, 86, 487, 242, 1191, 255, 535)
pitchNames <- c("CH", "CU", "FF", "FT", "SL")
myType <- rep.int(rep(pitchNames, 2), myFreq)
myJuly <- rep.int(rep(c("july", "other"), c(5L, 5L)), myFreq)
type_tab <- table(myType, myJuly)
type_tab
## myJuly
## myType july other
## CH 112 487
## CU 51 242
## FF 207 1191
## FT 66 255
## SL 86 535
# Create type_prop: within-column (July vs. other months) pitch-mix shares,
# rounded to three decimal places
colShares <- prop.table(type_tab, margin = 2)
type_prop <- round(colShares, 3)
# Print type_prop
type_prop
## myJuly
## myType july other
## CH 0.215 0.180
## CU 0.098 0.089
## FF 0.397 0.439
## FT 0.126 0.094
## SL 0.165 0.197
# Create type_prop table
# NOTE(review): the next two statements repeat the block immediately above
# verbatim (duplicated in the original transcript); re-running them is harmless
# because type_prop is recomputed from type_tab each time.
type_prop <- round(prop.table(type_tab, margin=2), 3)
# Print type_prop
type_prop
## myJuly
## myType july other
## CH 0.215 0.180
## CU 0.098 0.089
## FF 0.397 0.439
## FT 0.126 0.094
## SL 0.165 0.197
# Create ff_prop: pull the four-seam fastball ("FF") row out of the share table
ff_prop <- type_prop["FF", ]
# Print ff_prop
ff_prop
## july other
## 0.397 0.439
# Reconstruct ff_velo_month (mean 4-seam fastball velocity by period) and print it
ff_velo_month <- data.frame(start_speed = c(92.4, 91.7))
rownames(ff_velo_month) <- c("july", "other")
ff_velo_month
## start_speed
## july 92.4
## other 91.7
# Change up the type_prop data: keep the table form in tProp, then rebuild
# type_prop as a plain data frame with one row per pitch type
tProp <- type_prop
julyCol <- tProp[, 1]
otherCol <- tProp[, 2]
type_prop <- data.frame(Pitch=names(julyCol), July=julyCol, Other=otherCol, row.names=NULL)
type_prop
## Pitch July Other
## 1 CH 0.215 0.180
## 2 CU 0.098 0.089
## 3 FF 0.397 0.439
## 4 FT 0.126 0.094
## 5 SL 0.165 0.197
# Create the Difference column: July usage share relative to the other-month share
type_prop$Difference <- with(type_prop, (July - Other) / Other)
# Print type_prop
type_prop
## Pitch July Other Difference
## 1 CH 0.215 0.180 0.19444444
## 2 CU 0.098 0.089 0.10112360
## 3 FF 0.397 0.439 -0.09567198
## 4 FT 0.126 0.094 0.34042553
## 5 SL 0.165 0.197 -0.16243655
# Bar plot of the relative change in pitch usage during July
with(type_prop,
     barplot(Difference, names.arg = Pitch,
             main = "Pitch Usage in July vs. Other Months",
             ylab = "Percentage Change in July",
             ylim = c(-0.3, 0.3)))
# Create bs_table: joint counts of balls (0-3) and strikes (0-2).
# The twelve cell counts are listed balls-within-strikes (strikes = 0, 1, 2).
# They were previously duplicated verbatim in both rep() calls; hoist them into
# one vector so the two uses cannot drift apart.
bsCellCounts <- c(845, 307, 84, 19, 435, 371, 171, 50, 201, 310, 300, 139)
bsBalls <- rep(rep(0:3, times=3), times=bsCellCounts)
bsStrikes <- rep(rep(0:2, each=4), times=bsCellCounts)
bs_table <- table(bsBalls, bsStrikes)
bs_table
## bsStrikes
## bsBalls 0 1 2
## 0 845 435 201
## 1 307 371 310
## 2 84 171 300
## 3 19 50 139
# Create bs_table (this would be if the data were available - see above)
# bs_table <- table(greinke$balls, greinke$strikes)
# Create bs_prop_table: overall cell proportions, rounded to 3 decimal places
bs_prop_table <- round(prop.table(bs_table), 3)
# Print bs_prop_table
bs_prop_table
## bsStrikes
## bsBalls 0 1 2
## 0 0.261 0.135 0.062
## 1 0.095 0.115 0.096
## 2 0.026 0.053 0.093
## 3 0.006 0.015 0.043
# Print row sums (marginal distribution over balls, from the rounded cells)
rowSums(bs_prop_table)
## 0 1 2 3
## 0.458 0.306 0.172 0.064
# Print column sums (marginal distribution over strikes, from the rounded cells)
colSums(bs_prop_table)
## 0 1 2
## 0.388 0.318 0.294
# DO NOT HAVE THIS DATA
# Create bs_count
# greinke$bs_count <- paste(greinke$balls, greinke$strikes, sep="-")
# Print the first 6 rows of greinke
# head(greinke)
# Create the bs_count_tab data file.
# Frequencies cover the twelve ball-strike counts, July first and then the
# other months. Previously the numbers and labels were encoded as " ; "-
# delimited strings and re-parsed at runtime, with the 12 labels typed twice;
# use direct vectors and generate the "balls-strikes" labels once instead.
bsFreq <- c(136, 70, 29, 55, 64, 48, 15, 27, 45, 3, 8, 22,
            709, 365, 172, 252, 307, 262, 69, 144, 255, 16, 42, 117)
bsCountLabels <- paste(rep(0:3, each=3), rep(0:2, times=4), sep="-")
bsCounts <- rep(rep(bsCountLabels, times=2), times=bsFreq)
bsTypes <- rep(rep(c("july", "other"), each=12), times=bsFreq)
bs_count_tab <- table(bsCounts, bsTypes)
bs_count_tab
## bsTypes
## bsCounts july other
## 0-0 136 709
## 0-1 70 365
## 0-2 29 172
## 1-0 55 252
## 1-1 64 307
## 1-2 48 262
## 2-0 15 69
## 2-1 27 144
## 2-2 45 255
## 3-0 3 16
## 3-1 8 42
## 3-2 22 117
# Create bs_count_tab (if raw data were actually available - see above)
# bs_count_tab <- table(greinke$bs_count, greinke$july)
# Create bs_month: within-month (column-wise) share of each ball-strike count
bs_month <- round(prop.table(bs_count_tab, margin=2), 3)
# Print bs_month
bs_month
## bsTypes
## bsCounts july other
## 0-0 0.261 0.262
## 0-1 0.134 0.135
## 0-2 0.056 0.063
## 1-0 0.105 0.093
## 1-1 0.123 0.113
## 1-2 0.092 0.097
## 2-0 0.029 0.025
## 2-1 0.052 0.053
## 2-2 0.086 0.094
## 3-0 0.006 0.006
## 3-1 0.015 0.015
## 3-2 0.042 0.043
# Create diff_bs: other-month share minus July share, relative to other months
julyShare <- bs_month[, 1]
otherShare <- bs_month[, 2]
diff_bs <- round((otherShare - julyShare) / otherShare, 3)
# Print diff_bs
diff_bs
## 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0
## 0.004 0.007 0.111 -0.129 -0.088 0.052 -0.160 0.019 0.085 0.000
## 3-1 3-2
## 0.000 0.023
# Create a bar plot of the changes
barplot(diff_bs, main = "Ball-Strike Count Rate in July vs. Other Months",
        ylab = "Percentage Change in July", ylim = c(-0.15, 0.15), las = 2)
# Create type_bs: pitch type by ball-strike count. The sixty frequencies run
# through the twelve counts (bs_count_tab row order); within each count the
# five pitch types follow type_tab row order (CH, CU, FF, FT, SL).
# Previously the numbers were encoded as a " ; "-delimited string and re-parsed
# at runtime; use a numeric vector directly (one row per ball-strike count).
typeFreq <- c(92, 124, 482, 54, 93,
              93, 49, 167, 55, 71,
              36, 10, 61, 19, 75,
              70, 34, 136, 32, 35,
              79, 38, 136, 50, 68,
              62, 9, 89, 31, 119,
              27, 4, 37, 11, 5,
              46, 12, 71, 18, 24,
              52, 9, 109, 34, 96,
              0, 0, 17, 2, 0,
              18, 0, 24, 3, 5,
              24, 4, 69, 12, 30)
typeCount <- rep(rep(row.names(bs_count_tab), each=5), times=typeFreq)
typePitch <- rep(rep(row.names(type_tab), times=12), times=typeFreq)
type_bs <- table(typePitch, typeCount)
type_bs
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0 3-1 3-2
## CH 92 93 36 70 79 62 27 46 52 0 18 24
## CU 124 49 10 34 38 9 4 12 9 0 0 4
## FF 482 167 61 136 136 89 37 71 109 17 24 69
## FT 54 55 19 32 50 31 11 18 34 2 3 12
## SL 93 71 75 35 68 119 5 24 96 0 5 30
# Create type_bs (if greinke data were available; see above)
# type_bs <- table(greinke$pitch_type, greinke$bs_count)
# Print type_bs
type_bs
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0 3-1 3-2
## CH 92 93 36 70 79 62 27 46 52 0 18 24
## CU 124 49 10 34 38 9 4 12 9 0 0 4
## FF 482 167 61 136 136 89 37 71 109 17 24 69
## FT 54 55 19 32 50 31 11 18 34 2 3 12
## SL 93 71 75 35 68 119 5 24 96 0 5 30
# Create type_bs_prop: pitch mix within each ball-strike count (column shares)
countShares <- prop.table(type_bs, margin=2)
type_bs_prop <- round(countShares, 3)
# Print type_bs_prop
type_bs_prop
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0
## CH 0.109 0.214 0.179 0.228 0.213 0.200 0.321 0.269 0.173 0.000
## CU 0.147 0.113 0.050 0.111 0.102 0.029 0.048 0.070 0.030 0.000
## FF 0.570 0.384 0.303 0.443 0.367 0.287 0.440 0.415 0.363 0.895
## FT 0.064 0.126 0.095 0.104 0.135 0.100 0.131 0.105 0.113 0.105
## SL 0.110 0.163 0.373 0.114 0.183 0.384 0.060 0.140 0.320 0.000
## typeCount
## typePitch 3-1 3-2
## CH 0.360 0.173
## CU 0.000 0.029
## FF 0.480 0.496
## FT 0.060 0.086
## SL 0.100 0.216
# Create type_late: pitch type by early (0) vs. late (1) in the game.
# Counts run through the five pitch types (type_tab row order), early first.
# The frequency vector was previously duplicated verbatim in both rep() calls;
# hoist it once so the two uses cannot drift apart.
lateFreq <- c(416, 201, 1036, 249, 431, 183, 92, 362, 72, 190)
lateData <- rep(rep(0:1, each=5), times=lateFreq)
pitchData <- rep(rep(row.names(type_tab), times=2), times=lateFreq)
type_late <- table(pitchData, lateData)
type_late
## lateData
## pitchData 0 1
## CH 416 183
## CU 201 92
## FF 1036 362
## FT 249 72
## SL 431 190
# Create the late_in_game column (if had the greinke data; see above)
# greinke$late_in_game <- ifelse(greinke$inning > 5, 1, 0)
# Convert late_in_game (if had the greinke data; see above)
# greinke$late_in_game <- factor(greinke$late_in_game)
# Create type_late (if had the greinke data; see above)
# type_late <- table(greinke$pitch_type, greinke$late_in_game)
# Create type_late_prop: pitch mix within early (0) vs. late (1) innings
lateShares <- prop.table(type_late, margin=2)
type_late_prop <- round(lateShares, 3)
# Print type_late_prop
type_late_prop
## lateData
## pitchData 0 1
## CH 0.178 0.204
## CU 0.086 0.102
## FF 0.444 0.403
## FT 0.107 0.080
## SL 0.185 0.211
# Create t_type_late: transpose so the game stage indexes the rows
t_type_late <- t(type_late_prop)
# Print dimensions of t_type_late
dim(t_type_late)
## [1] 2 5
# Print dimensions of type_late
dim(type_late_prop)
## [1] 5 2
# Relabel the rows for the plot legend
rownames(t_type_late) <- c("Early", "Late")
# Make barplot using t_type_late (side-by-side bars per pitch type)
barplot(t_type_late, beside = TRUE, col = c("red", "blue"),
        main = "Early vs. Late In Game Pitch Selection",
        ylab = "Pitch Selection Proportion",
        legend = rownames(t_type_late))
Chapter 3 - Exploring pitch locations
Pitch location and Greinke’s July - pitches lower and further from the plate are harder to hit, but pitches repeatedly in the same location are easier to hit:
For loop for plots - execute the code across all the zones:
Example code includes:
# DO NOT HAVE THIS DATA
# Calculate average pitch height in inches in July vs. other months
# tapply(greinke$pz, greinke$july, mean) * 12
# Create greinke_lhb
# greinke_lhb <- subset(greinke, batter_stand == "L")
# Create greinke_rhb
# greinke_rhb <- subset(greinke, batter_stand == "R")
# Compute average px location for LHB
# tapply(greinke_lhb$px, greinke_lhb$july, mean) * 12
# Compute average px location for RHB
# tapply(greinke_rhb$px, greinke_rhb$july, mean) * 12
# Plot location of all pitches
# plot(greinke$pz ~ greinke$px,
# col = factor(greinke$july),
# xlim = c(-3, 3))
# Formatting code, don't change this
# par(mfrow = c(1, 2))
# Plot the pitch locations for July
# plot(pz ~ px, data = greinke_july,
# col = "red", pch = 16,
# xlim = c(-3, 3), ylim = c(-1, 6),
# main = "July")
# Plot the pitch locations for other months
# plot(pz ~ px, data = greinke_other,
# col = "black", pch = 16,
# xlim = c(-3, 3), ylim = c(-1, 6),
# main = "Other months")
# Create greinke_sub
# greinke_sub <- subset(greinke, px > -2 & px < 2 & pz > 0 & pz < 5)
# Plot pitch location window
# plot(x = c(-2, 2), y = c(0, 5), type = "n",
# main = "Greinke Locational Zone Proportions",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# Add the grid lines
# grid(lty = "solid", col = "black")
# Create greinke_table
# greinke_table <- table(greinke_sub$zone)
# Create zone_prop
# zone_prop <- round(prop.table(greinke_table), 3)
# Plot strike zone grid, don't change this
# plot_grid()
# Add text from zone_prop[1]
# text(zone_prop[1], x=-1.5, y=4.5, cex=1.5)
# Plot grid, don't change this
# plot_grid()
# Plot text using for loop
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_prop[i], cex = 1.5)
# }
# Create zone_prop_july
# zone_prop_july <- round(
# table(greinke_sub$zone[greinke_sub$july == "july"]) /
# nrow(subset(greinke_sub, july == "july")), 3)
# Create zone_prop_other
# zone_prop_other <- round(
# table(greinke_sub$zone[greinke_sub$july == "other"]) /
# nrow(subset(greinke_sub, july == "other")), 3)
# Print zone_prop_july
# zone_prop_july
# Print zone_prop_other
# zone_prop_other
# Fix zone_prop_july vector, don't change this
# zone_prop_july2 <- c(zone_prop_july[1:3], 0.00, zone_prop_july[4:19])
# names(zone_prop_july2) <- c(1:20)
# Create zone_prop_diff
# zone_prop_diff <- zone_prop_july2 - zone_prop_other
# Print zone_prop_diff
# zone_prop_diff
# Plot grid, don't change this
# plot_grid()
# Create for loop
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_prop_diff[i, ], cex = 1.5)
# }
# Create greinke_zone_tab
# greinke_zone_tab <- table(greinke_sub$zone, greinke_sub$bs_count)
# Create zone_count_prop
# zone_count_prop <- round(prop.table(greinke_zone_tab, margin=2), 3)
# Print zone_count_prop
# zone_count_prop
# Create zone_count_diff
# zone_count_diff <- zone_count_prop[, 3] - zone_count_prop[, 10]
# Print the table
# zone_count_diff
# Plot grid, don't change this
# plot(x = c(-2, 2), y = c(0, 5), type = "n",
# main = "Greinke Locational Zone (0-2 vs. 3-0 Counts)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# grid(lty = "solid", col = "black")
# Add text to the figure for location differences
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_count_diff[i], cex = 1.5)
# }
cat("\n\nDo not have the data to run the associated code\n\n")
##
##
## Do not have the data to run the associated code
Chapter 4 - Exploring batted ball outcomes
Batted ball outcomes - contact rates:
Using ggplot2 - reduce the labor to produce certain types of graphics:
Batted ball outcomes - exit velocity:
Example code includes:
# DO NOT HAVE THIS DATA . . .
# Create batter_swing
# no_swing <- c("Ball", "Called Strike", "Ball in Dirt", "Hit By Pitch")
# greinke_ff$batter_swing <- ifelse(greinke_ff$pitch_result %in% no_swing, 0, 1)
# Create swing_ff
# swing_ff <- subset(greinke_ff, batter_swing == 1)
# Create the contact variable
# no_contact <- c("Swinging Strike", "Missed Bunt")
# swing_ff$contact <- ifelse(swing_ff$pitch_result %in% no_contact, 0, 1)
# Create velo_bin: add one line for "Fast"
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed < 90.5, "Slow", NA)
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed >= 90.5 & swing_ff$start_speed < 92.5,
# "Medium", swing_ff$velo_bin)
#
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed >= 92.5,
# "Fast", swing_ff$velo_bin)
# Aggregate contact rate by velocity bin
# tapply(swing_ff$contact, swing_ff$velo_bin, FUN=mean)
#
#
# bin_pitch_speed <- function(start_speed) {
# as.integer(cut(start_speed, quantile(start_speed, probs = 0:3 / 3), include.lowest = TRUE))
# }
#
#
# Create the subsets for each pitch type
# swing_ff <- subset(swings, pitch_type == "FF")
# swing_ch <- subset(swings, pitch_type == "CH")
# swing_cu <- subset(swings, pitch_type == "CU")
# swing_ft <- subset(swings, pitch_type == "FT")
# swing_sl <- subset(swings, pitch_type == "SL")
# Make velo_bin_pitch variable for each subset
# swing_ff$velo_bin <- bin_pitch_speed(swing_ff$start_speed)
# swing_ch$velo_bin <- bin_pitch_speed(swing_ch$start_speed)
# swing_cu$velo_bin <- bin_pitch_speed(swing_cu$start_speed)
# swing_ft$velo_bin <- bin_pitch_speed(swing_ft$start_speed)
# swing_sl$velo_bin <- bin_pitch_speed(swing_sl$start_speed)
# Print quantile levels for each pitch
# thirds <- c(0, 1/3, 2/3, 1)
# quantile(swing_ff$start_speed, probs = thirds)
# quantile(swing_ch$start_speed, probs = thirds)
# quantile(swing_cu$start_speed, probs = thirds)
# quantile(swing_ft$start_speed, probs = thirds)
# quantile(swing_sl$start_speed, probs = thirds)
# Calculate contact rate by velocity for swing_ff
# tapply(swing_ff$contact, swing_ff$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_ft
# tapply(swing_ft$contact, swing_ft$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_ch
# tapply(swing_ch$contact, swing_ch$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_cu
# tapply(swing_cu$contact, swing_cu$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_sl
# tapply(swing_sl$contact, swing_sl$velo_bin, FUN=mean)
# Create swings_str2
# swings_str2 <- subset(swings, strikes == 2)
# Print number of observations
# nrow(swings_str2)
# Print a table of pitch use
# table(swings_str2$pitch_type)
# Calculate contact rate by pitch type
# round(tapply(swings_str2$contact, swings_str2$pitch_type, FUN=mean), 3)
# Create subset of swings: swings_rhb
# swings_rhb <- subset(swings, batter_stand == "R")
# Create subset of swings: swings_lhb
# swings_lhb <- subset(swings, batter_stand == "L")
# Create zone_contact_r
# zone_contact_r <- round(tapply(swings_rhb$contact, swings_rhb$zone, FUN=mean), 3)
# Create zone_contact_l
# zone_contact_l <- round(tapply(swings_lhb$contact, swings_lhb$zone, FUN=mean), 3)
# Plot figure grid for RHB
# par(mfrow = c(1, 2))
# plot(x = c(-1, 1), y = c(1, 4), type = "n",
# main = "Contact Rate by Location (RHB)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# abline(v = 0)
# abline(h = 2)
# abline(h = 3)
# Add text for RHB contact rate
# for(i in unique(c(6, 7, 10, 11, 14, 15))) {
# text(mean(swings_rhb$zone_px[swings_rhb$zone == i]),
# mean(swings_rhb$zone_pz[swings_rhb$zone == i]),
# zone_contact_r[rownames(zone_contact_r) == i], cex = 1.5)
# }
# Add LHB plot
# plot(x = c(-1, 1), y = c(1, 4), type = "n",
# main = "Contact Rate by Location (LHB)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# abline(v = 0)
# abline(h = 2)
# abline(h = 3)
# Add text for LHB contact rate
# for(i in unique(c(6, 7, 10, 11, 14, 15))) {
# text(mean(swings_lhb$zone_px[swings_lhb$zone == i]),
# mean(swings_lhb$zone_pz[swings_lhb$zone == i]),
# zone_contact_l[rownames(zone_contact_l) == i], cex = 1.5)
# }
# Create vector px
# px <- rep(seq(-1.5, 1.5, by=1), times=5)
# Create vector pz
# pz <- rep(seq(4.5, 0.5, by=-1), each=4)
# Create vector of zone numbers
# zone <- seq(1, 20, by=1)
# Create locgrid
# locgrid <- data.frame(zone=zone, px=px, pz=pz)
# Print locgrid
# locgrid
# The gridExtra package is preloaded in your workspace
# Examine new contact data
# zone_contact_r
# zone_contact_l
# Merge locgrid with zone_contact_r
# locgrid <- merge(locgrid, zone_contact_r, by="zone", all.x=TRUE)
# Merge locgrid with zone_contact_l
# locgrid <- merge(locgrid, zone_contact_l, by="zone", all.x=TRUE)
# Print locgrid to the console
# locgrid
# Make base grid with ggplot()
# plot_base_grid <- ggplot(locgrid, aes(x=px, y=pz))
# Arrange the plots side-by-side
# grid.arrange(plot_base_grid, plot_base_grid, ncol=2)
# Make RHB plot
# plot_titles_rhb <- plot_base_grid +
# ggtitle("RHB Contact Rates") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Make LHB plot
# plot_titles_lhb <- plot_base_grid +
# ggtitle("LHB Contact Rates") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Display both side-by-side
# grid.arrange(plot_titles_rhb, plot_titles_lhb, ncol=2)
# Make RHB plot
# plot_colors_rhb <- plot_titles_rhb +
# geom_tile(aes(fill = contact_rate_r)) +
# scale_fill_gradientn(name = "Contact Rate",
# limits = c(0.5, 1),
# breaks = seq(from = 0.5, to = 1, by = 0.1),
# colors = c(brewer.pal(n = 7, name = "Reds")))
# Make LHB plot
# plot_colors_lhb <- plot_titles_lhb +
# geom_tile(aes(fill = contact_rate_l)) +
# scale_fill_gradientn(name = "Contact Rate",
# limits = c(0.5, 1),
# breaks = seq(from = 0.5, to = 1, by = 0.1),
# colors = c(brewer.pal(n = 7, name = "Reds")))
# Display plots side-by-side
# grid.arrange(plot_colors_rhb, plot_colors_lhb, ncol=2)
# Make RHB plot
# plot_contact_rhb <- plot_colors_rhb +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$contact_rate_r, size = 5)
# Make LHB plot
# plot_contact_lhb <- plot_colors_lhb +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$contact_rate_l, size = 5)
# Plot them side-by-side
# grid.arrange(plot_contact_rhb, plot_contact_lhb, ncol=2)
# Create pcontact
# pcontact <- subset(swings, contact == 1 & !is.na(batted_ball_velocity))
# Create pcontact_r
# pcontact_r <- subset(pcontact, batter_stand == "R")
# Create pcontact_l
# pcontact_l <- subset(pcontact, batter_stand == "L")
# Create exit_speed_r
# exit_speed_r <- data.frame(tapply(pcontact_r$batted_ball_velocity,
# pcontact_r$zone, mean))
# exit_speed_r <- round(exit_speed_r, 1)
# colnames(exit_speed_r) <- "exit_speed_rhb"
# exit_speed_r$zone <- row.names(exit_speed_r)
# Create exit_speed_l
# exit_speed_l <- data.frame(tapply(pcontact_l$batted_ball_velocity,
# pcontact_l$zone, mean))
# exit_speed_l <- round(exit_speed_l, 1)
# colnames(exit_speed_l) <- "exit_speed_lhb"
# exit_speed_l$zone <- row.names(exit_speed_l)
# Merge with locgrid
# locgrid <- merge(locgrid, exit_speed_r, by = "zone", all.x = T)
# locgrid <- merge(locgrid, exit_speed_l, by = "zone", all.x = T)
# Print locgrid
# locgrid
# Create RHB exit speed plotting object
# plot_exit_rhb <- plot_base_grid +
# geom_tile(data = locgrid, aes(fill = exit_speed_rhb)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# limits = c(60, 95),
# breaks = seq(from = 60, to = 95, by = 5),
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$exit_speed_rhb, size = 5) +
# ggtitle("RHB Exit Velocity (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Create LHB exit speed plotting object
# plot_exit_lhb <- plot_base_grid +
# geom_tile(data = locgrid, aes(fill = exit_speed_lhb)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# limits = c(60, 95),
# breaks = seq(from = 60, to = 95, by = 5),
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$exit_speed_lhb, size = 5) +
# ggtitle("LHB Exit Velocity (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Plot each side-by-side
# grid.arrange(plot_exit_rhb, plot_exit_lhb, ncol=2)
# Examine head() and tail() of exit_tidy
# head(exit_tidy)
# tail(exit_tidy)
# Create plot_exit
# plot_exit <- plot_base_grid +
# geom_tile(data = exit_tidy, aes(fill = exit_speed)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# ggtitle("Exit Speed (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15)) +
# facet_grid(. ~ batter_stand)
# Display plot_exit
# plot_exit
cat("\n\nDo not have the data to run the associated code\n\n")
##
##
## Do not have the data to run the associated code
Chapter 1 - Language of Data
Examining the “High School and Beyond” data frame - one observation per row, one variable per column:
Types of variables - take note of the dimensions first:
Categorical data in R - factors:
Discretize a variable - convert numerical variable to categorical variable:
Visualizing numerical data - good first step of any exploratory data analysis (picture is worth 1000 words):
Example code includes:
# Load data
data(email50, package="openintro")
# View its structure
str(email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 07:19:16" "2012-02-16 14:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
# Glimpse email50
glimpse(email50)
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 07:19:16, 2012-02-16 14:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
# Subset of emails with big numbers: email50_big
email50_big <- email50 %>%
filter(number == "big")
# Glimpse the subset
glimpse(email50_big)
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 14:10:06, 2012-02-04 17:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Table of number variable
table(email50_big$number)
##
## none small big
## 0 0 7
# Drop levels
email50_big$number <- droplevels(email50_big$number)
# Another table of number variable
table(email50_big$number)
##
## big
## 7
# Calculate median number of characters: med_num_char
# Note that wrapping in () also prints the variable
(med_num_char <- median(email50$num_char))
## [1] 6.8895
# Create num_char_cat variable in email50
email50 <- email50 %>%
mutate(num_char_cat = ifelse(num_char < med_num_char, "below median", "at or above median"))
# Count emails in each category
table(email50$num_char_cat)
##
## at or above median below median
## 25 25
# Create number_yn column in email50
email50 <- email50 %>%
mutate(number_yn = ifelse(number == "none", "no", "yes"))
# Visualize number_yn
ggplot(email50, aes(x = number_yn)) +
geom_bar()
# Scatterplot of exclaim_mess vs. num_char
ggplot(email50, aes(x = num_char, y = exclaim_mess, color = factor(spam))) +
geom_point()
Chapter 2 - Study Types and Cautions
Observational studies and experiments - study types, and scopes of inferences:
Random sampling and random assignment:
Simpson’s paradox - when a confounder interferes with understanding response (y) variables and explanatory (x1, x2, etc.) variables:
Example code includes:
# Load data
data(gapminder, package="gapminder")
# Glimpse data
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Identify type of study
type_of_study <- "observational"
dfUCB <- as.data.frame(UCBAdmissions)
ucb_admit <- data.frame(Admit=factor(rep(dfUCB$Admit, times=dfUCB$Freq)),
Gender=factor(rep(dfUCB$Gender, times=dfUCB$Freq)),
Dept=as.character(rep(dfUCB$Dept, times=dfUCB$Freq)),
stringsAsFactors=FALSE
)
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
# Count number of male and female applicants admitted
ucb_counts <- ucb_admit %>%
count(Admit, Gender)
# View result
ucb_counts
## Source: local data frame [4 x 3]
## Groups: Admit [?]
##
## Admit Gender n
## <fctr> <fctr> <int>
## 1 Admitted Male 1198
## 2 Admitted Female 557
## 3 Rejected Male 1493
## 4 Rejected Female 1278
# Spread the output across columns
ucb_counts %>%
tidyr::spread(Admit, n)
## # A tibble: 2 × 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Male 1198 1493
## 2 Female 557 1278
ucb_admit %>%
# Table of counts of admission status and gender
count(Admit, Gender) %>%
# Spread output across columns based on admission status
tidyr::spread(Admit, n) %>%
# Create new variable
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 × 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Male 1198 1493 0.4451877
## 2 Female 557 1278 0.3035422
# Table of counts of admission status and gender for each department
admit_by_dept <- ucb_admit %>%
count(Dept, Gender, Admit) %>%
tidyr::spread(Admit, n)
# View result
admit_by_dept
## Source: local data frame [12 x 4]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Male 512 313
## 2 A Female 89 19
## 3 B Male 353 207
## 4 B Female 17 8
## 5 C Male 120 205
## 6 C Female 202 391
## 7 D Male 138 279
## 8 D Female 131 244
## 9 E Male 53 138
## 10 E Female 94 299
## 11 F Male 22 351
## 12 F Female 24 317
# Percentage of males admitted for each department
admit_by_dept %>%
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## Source: local data frame [12 x 5]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Male 512 313 0.62060606
## 2 A Female 89 19 0.82407407
## 3 B Male 353 207 0.63035714
## 4 B Female 17 8 0.68000000
## 5 C Male 120 205 0.36923077
## 6 C Female 202 391 0.34064081
## 7 D Male 138 279 0.33093525
## 8 D Female 131 244 0.34933333
## 9 E Male 53 138 0.27748691
## 10 E Female 94 299 0.23918575
## 11 F Male 22 351 0.05898123
## 12 F Female 24 317 0.07038123
Chapter 3 - Sampling Strategies and Experimental Design
Sampling strategies:
Sampling in R:
Principles of experimental design:
Example code includes:
usrState <- "Connecticut ; Maine ; Massachusetts ; New Hampshire ; Rhode Island ; Vermont ; New Jersey ; New York ; Pennsylvania ; Illinois ; Indiana ; Michigan ; Ohio ; Wisconsin ; Iowa ; Kansas ; Minnesota ; Missouri ; Nebraska ; North Dakota ; South Dakota ; Delaware ; Florida ; Georgia ; Maryland ; North Carolina ; South Carolina ; Virginia ; District of Columbia ; West Virginia ; Alabama ; Kentucky ; Mississippi ; Tennessee ; Arkansas ; Louisiana ; Oklahoma ; Texas ; Arizona ; Colorado ; Idaho ; Montana ; Nevada ; New Mexico ; Utah ; Wyoming ; Alaska ; California ; Hawaii ; Oregon ; Washington"
usrRegion <- "Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West"
us_regions <- data.frame(state=factor(strsplit(usrState, " ; ")[[1]]),
region=factor(strsplit(usrRegion, " ; ")[[1]])
)
# Simple random sample: states_srs
states_srs <- us_regions %>%
dplyr::sample_n(size=8)
# Count states by region
states_srs %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 3
## 2 Northeast 2
## 3 South 1
## 4 West 2
# Stratified sample
states_str <- us_regions %>%
group_by(region) %>%
dplyr::sample_n(size=2)
# Count states by region
states_str %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
Chapter 4 - Case Study
Data will be from a study titled “Beauty in the Classroom”:
Variables in the data:
Example code includes:
# NEED DATASET
evStudents <- "43 ; 125 ; 125 ; 123 ; 20 ; 40 ; 44 ; 55 ; 195 ; 46 ; 27 ; 25 ; 20 ; 25 ; 42 ; 20 ; 18 ; 48 ; 44 ; 48 ; 45 ; 59 ; 87 ; 282 ; 292 ; 130 ; 285 ; 272 ; 286 ; 302 ; 41 ; 34 ; 41 ; 41 ; 34 ; 41 ; 22 ; 21 ; 17 ; 30 ; 23 ; 20 ; 60 ; 33 ; 44 ; 49 ; 29 ; 48 ; 40 ; 19 ; 16 ; 15 ; 23 ; 11 ; 29 ; 21 ; 18 ; 19 ; 20 ; 25 ; 33 ; 24 ; 34 ; 21 ; 30 ; 25 ; 35 ; 40 ; 30 ; 42 ; 57 ; 57 ; 51 ; 30 ; 36 ; 37 ; 29 ; 27 ; 28 ; 52 ; 26 ; 30 ; 33 ; 177 ; 199 ; 32 ; 37 ; 161 ; 41 ; 44 ; 53 ; 49 ; 32 ; 135 ; 33 ; 19 ; 111 ; 149 ; 27 ; 136 ; 140 ; 31 ; 15 ; 29 ; 25 ; 18 ; 45 ; 15 ; 38 ; 15 ; 28 ; 23 ; 19 ; 23 ; 22 ; 20 ; 19 ; 23 ; 22 ; 15 ; 22 ; 31 ; 21 ; 36 ; 19 ; 37 ; 26 ; 39 ; 184 ; 50 ; 157 ; 164 ; 24 ; 68 ; 47 ; 14 ; 15 ; 24 ; 39 ; 26 ; 40 ; 159 ; 151 ; 47 ; 122 ; 45 ; 16 ; 23 ; 16 ; 18 ; 16 ; 15 ; 28 ; 17 ; 13 ; 21 ; 17 ; 134 ; 48 ; 64 ; 69 ; 12 ; 43 ; 14 ; 15 ; 18 ; 16 ; 10 ; 47 ; 15 ; 14 ; 12 ; 246 ; 316 ; 15 ; 15 ; 29 ; 21 ; 8 ; 16 ; 26 ; 10 ; 26 ; 26 ; 26 ; 21 ; 12 ; 27 ; 27 ; 25 ; 15 ; 15 ; 17 ; 55 ; 48 ; 21 ; 39 ; 27 ; 14 ; 26 ; 16 ; 16 ; 13 ; 14 ; 17 ; 13 ; 15 ; 10 ; 34 ; 16 ; 14 ; 12 ; 39 ; 35 ; 45 ; 45 ; 17 ; 14 ; 14 ; 14 ; 12 ; 15 ; 51 ; 23 ; 57 ; 50 ; 24 ; 23 ; 23 ; 28 ; 45 ; 42 ; 57 ; 27 ; 38 ; 22 ; 43 ; 31 ; 13 ; 15 ; 34 ; 19 ; 20 ; 23 ; 27 ; 32 ; 21 ; 24 ; 21 ; 28 ; 29 ; 67 ; 89 ; 82 ; 122 ; 131 ; 114 ; 149 ; 23 ; 98 ; 27 ; 30 ; 30 ; 69 ; 15 ; 10 ; 11 ; 14 ; 11 ; 14 ; 77 ; 41 ; 88 ; 78 ; 65 ; 157 ; 68 ; 67 ; 80 ; 137 ; 69 ; 91 ; 80 ; 90 ; 34 ; 73 ; 44 ; 36 ; 20 ; 35 ; 248 ; 168 ; 247 ; 22 ; 103 ; 62 ; 82 ; 51 ; 35 ; 34 ; 37 ; 14 ; 266 ; 254 ; 13 ; 282 ; 17 ; 19 ; 42 ; 27 ; 16 ; 19 ; 86 ; 29 ; 88 ; 98 ; 44 ; 65 ; 63 ; 75 ; 43 ; 80 ; 52 ; 48 ; 66 ; 100 ; 11 ; 16 ; 22 ; 11 ; 10 ; 16 ; 16 ; 10 ; 32 ; 10 ; 16 ; 67 ; 22 ; 28 ; 30 ; 15 ; 13 ; 18 ; 26 ; 30 ; 14 ; 24 ; 22 ; 25 ; 26 ; 22 ; 26 ; 20 ; 22 ; 21 ; 21 ; 69 ; 65 ; 62 ; 67 ; 40 ; 45 ; 574 ; 579 ; 537 ; 581 ; 527 ; 87 ; 84 ; 79 ; 92 ; 24 ; 67 ; 103 ; 190 ; 68 ; 60 ; 64 ; 31 ; 62 ; 37 ; 13 ; 13 ; 15 ; 79 ; 13 ; 
98 ; 97 ; 11 ; 78 ; 56 ; 20 ; 17 ; 20 ; 19 ; 26 ; 14 ; 18 ; 12 ; 19 ; 16 ; 16 ; 12 ; 17 ; 15 ; 16 ; 17 ; 21 ; 17 ; 10 ; 17 ; 17 ; 18 ; 16 ; 26 ; 18 ; 20 ; 17 ; 21 ; 21 ; 20 ; 20 ; 13 ; 16 ; 17 ; 18 ; 24 ; 20 ; 120 ; 155 ; 38 ; 70 ; 149 ; 137 ; 29 ; 55 ; 136 ; 96 ; 60 ; 108 ; 39 ; 15 ; 111 ; 17 ; 19 ; 27 ; 19 ; 13 ; 19 ; 22 ; 20 ; 27 ; 132 ; 127 ; 85 ; 101 ; 21 ; 86 ; 84 ; 67 ; 66 ; 35"
evScore <- "4.7 ; 4.1 ; 3.9 ; 4.8 ; 4.6 ; 4.3 ; 2.8 ; 4.1 ; 3.4 ; 4.5 ; 3.8 ; 4.5 ; 4.6 ; 3.9 ; 3.9 ; 4.3 ; 4.5 ; 4.8 ; 4.6 ; 4.6 ; 4.9 ; 4.6 ; 4.5 ; 4.4 ; 4.6 ; 4.7 ; 4.5 ; 4.8 ; 4.9 ; 4.5 ; 4.4 ; 4.3 ; 4.1 ; 4.2 ; 3.5 ; 3.4 ; 4.5 ; 4.4 ; 4.4 ; 2.5 ; 4.3 ; 4.5 ; 4.8 ; 4.8 ; 4.4 ; 4.7 ; 4.4 ; 4.7 ; 4.5 ; 4 ; 4.3 ; 4.4 ; 4.5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.7 ; 5 ; 3.6 ; 3.7 ; 4.3 ; 4.1 ; 4.2 ; 4.7 ; 4.7 ; 3.5 ; 4.1 ; 4.2 ; 4 ; 4 ; 3.9 ; 4.4 ; 3.8 ; 3.5 ; 4.2 ; 3.5 ; 3.6 ; 2.9 ; 3.3 ; 3.3 ; 3.2 ; 4.6 ; 4.2 ; 4.3 ; 4.4 ; 4.1 ; 4.6 ; 4.4 ; 4.8 ; 4.3 ; 3.6 ; 4.3 ; 4 ; 4.2 ; 4.1 ; 4.1 ; 4.4 ; 4.3 ; 4.4 ; 4.4 ; 4.9 ; 5 ; 4.4 ; 4.8 ; 4.9 ; 4.3 ; 5 ; 4.7 ; 4.5 ; 3.5 ; 3.9 ; 4 ; 4 ; 3.7 ; 3.4 ; 3.3 ; 3.8 ; 3.9 ; 3.4 ; 3.7 ; 4.1 ; 3.7 ; 3.5 ; 3.5 ; 4.4 ; 3.4 ; 4.3 ; 3.7 ; 4.7 ; 3.9 ; 3.6 ; 4.5 ; 4.5 ; 4.8 ; 4.8 ; 4.7 ; 4.5 ; 4.3 ; 4.8 ; 4.1 ; 4.4 ; 4.3 ; 3.6 ; 4.5 ; 4.3 ; 4.4 ; 4.7 ; 4.8 ; 3.5 ; 3.8 ; 3.6 ; 4.2 ; 3.6 ; 4.4 ; 3.7 ; 4.3 ; 4.6 ; 4.6 ; 4.1 ; 3.6 ; 2.3 ; 4.3 ; 4.4 ; 3.6 ; 4.4 ; 3.9 ; 3.8 ; 3.4 ; 4.9 ; 4.1 ; 3.2 ; 4.2 ; 3.9 ; 4.9 ; 4.7 ; 4.4 ; 4.2 ; 4 ; 4.4 ; 3.9 ; 4.4 ; 3 ; 3.5 ; 2.8 ; 4.6 ; 4.3 ; 3.4 ; 3 ; 4.2 ; 4.3 ; 4.1 ; 4.6 ; 3.9 ; 3.5 ; 4 ; 4 ; 3.9 ; 3.3 ; 4 ; 3.8 ; 4.2 ; 4 ; 3.8 ; 3.3 ; 4.1 ; 4.7 ; 4.4 ; 4.8 ; 4.8 ; 4.6 ; 4.6 ; 4.8 ; 4.4 ; 4.7 ; 4.7 ; 3.3 ; 4.4 ; 4.3 ; 4.9 ; 4.4 ; 4.7 ; 4.3 ; 4.8 ; 4.5 ; 4.7 ; 3.3 ; 4.7 ; 4.6 ; 3.6 ; 4 ; 4.1 ; 4 ; 4.5 ; 4.6 ; 4.8 ; 4.6 ; 4.9 ; 3.1 ; 3.7 ; 3.7 ; 3.9 ; 3.9 ; 3.2 ; 4.4 ; 4.2 ; 4.7 ; 3.9 ; 3.6 ; 3.4 ; 4.4 ; 4.4 ; 4.1 ; 3.6 ; 3.5 ; 4.1 ; 3.8 ; 4 ; 4.8 ; 4.2 ; 4.6 ; 4.3 ; 4.8 ; 3.8 ; 4.5 ; 4.9 ; 4.9 ; 4.8 ; 4.7 ; 4.6 ; 4.3 ; 4.4 ; 4.5 ; 4.2 ; 4.8 ; 4.6 ; 4.9 ; 4.8 ; 4.8 ; 4.6 ; 4.7 ; 4.1 ; 3.8 ; 4 ; 4.1 ; 4 ; 4.1 ; 3.5 ; 4.1 ; 3.6 ; 4 ; 3.9 ; 3.8 ; 4.4 ; 4.7 ; 3.8 ; 4.1 ; 4.1 ; 4.7 ; 4.3 ; 4.4 ; 4.5 ; 3.1 ; 3.7 ; 4.5 ; 3 ; 4.6 ; 3.7 ; 3.6 ; 3.2 ; 3.3 ; 2.9 ; 4.2 ; 4.5 ; 3.8 ; 3.7 ; 3.7 ; 4 ; 3.7 ; 4.5 ; 3.8 ; 3.9 ; 4.6 ; 4.5 ; 4.2 ; 4 ; 3.8 ; 3.5 ; 2.7 ; 4 ; 4.6 ; 3.9 ; 4.5 ; 3.7 ; 2.4 ; 3.1 ; 2.5 ; 3 ; 4.5 ; 4.8 ; 4.9 ; 
4.5 ; 4.6 ; 4.5 ; 4.9 ; 4.4 ; 4.6 ; 4.6 ; 5 ; 4.9 ; 4.6 ; 4.8 ; 4.9 ; 4.9 ; 4.9 ; 5 ; 4.5 ; 3.5 ; 3.8 ; 3.9 ; 3.9 ; 4.2 ; 4.1 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.9 ; 4.2 ; 4.5 ; 3.9 ; 4.4 ; 4 ; 3.6 ; 3.7 ; 2.7 ; 4.5 ; 4.4 ; 3.9 ; 3.6 ; 4.4 ; 4.4 ; 4.7 ; 4.5 ; 4.1 ; 3.7 ; 4.3 ; 3.5 ; 3.7 ; 4 ; 4 ; 3.1 ; 4.5 ; 4.8 ; 4.2 ; 4.9 ; 4.8 ; 3.5 ; 3.6 ; 4.4 ; 3.4 ; 3.9 ; 3.8 ; 4.8 ; 4.6 ; 5 ; 3.8 ; 4.2 ; 3.3 ; 4.7 ; 4.6 ; 4.6 ; 4 ; 4.2 ; 4.9 ; 4.5 ; 4.8 ; 3.8 ; 4.8 ; 5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.8 ; 4.9 ; 4.9 ; 3.9 ; 3.9 ; 4.5 ; 4.5 ; 3.3 ; 3.1 ; 2.8 ; 3.1 ; 4.2 ; 3.4 ; 3 ; 3.3 ; 3.6 ; 3.7 ; 3.6 ; 4.3 ; 4.1 ; 4.9 ; 4.8 ; 3.7 ; 3.9 ; 4.5 ; 3.6 ; 4.4 ; 3.4 ; 4.4 ; 4.5 ; 4.5 ; 4.5 ; 4.6 ; 4.1 ; 4.5 ; 3.5 ; 4.4 ; 4.4 ; 4.1"
evBty <- "5 ; 5 ; 5 ; 5 ; 3 ; 3 ; 3 ; 3.3 ; 3.3 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.8 ; 4.8 ; 4.8 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4 ; 4 ; 4 ; 4 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 2.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 7.8 ; 7.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 5.2 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.7 ; 2.7 ; 2.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.3 ; 2.3 ; 2.3 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 3 ; 3 ; 3 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 6.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 8.2 ; 8.2 ; 8.2 ; 8.2 ; 6.5 ; 6.5 ; 6.5 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 7 ; 7 ; 7 ; 4.7 ; 3.8 ; 3.8 ; 3.8 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.7 ; 5.7 ; 5.7 ; 5.7 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 6.7 ; 6.7 ; 6.7 ; 3.7 ; 3.7 ; 3.7 ; 3.8 ; 3.8 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.7 ; 3.7 ; 3.5 ; 3.5 ; 3.5 ; 2.7 ; 5.7 ; 6 ; 6 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 7.2 ; 7.2 ; 1.7 ; 1.7 ; 1.7 ; 5.2 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 
3.5 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.2 ; 5.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 3 ; 3 ; 3 ; 6.3 ; 6.3 ; 6.3 ; 6.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 5.8 ; 5.8 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 7.8 ; 7.8 ; 7.8 ; 3.3 ; 3.3 ; 4.5 ; 4.5 ; 4.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 5.3 ; 5.3 ; 5.3 ; 5.3"
evals <- data.frame(score=as.numeric(strsplit(evScore, " ; ")[[1]]),
cls_students=as.integer(strsplit(evStudents, " ; ")[[1]]),
bty_avg=as.numeric(strsplit(evBty, " ; ")[[1]])
)
# Inspect evals
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Inspect variable types
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Remove non-factor variables from this vector
cat_vars <- c("rank", "ethnicity", "gender", "language",
"cls_level", "cls_profs", "cls_credits",
"pic_outfit", "pic_color")
# Recode cls_students as cls_type: evals
evals <- evals %>%
# Create new variable
mutate(cls_type = ifelse(cls_students <= 18, "small",
ifelse(cls_students >= 60, "large", "midsize")
)
)
# Scatterplot of score vs. bty_avg
ggplot(evals, aes(x=bty_avg, y=score)) +
geom_point()
# Scatterplot of score vs. bty_avg colored by cls_type
ggplot(data=evals, aes(x=bty_avg, y=score, color=cls_type)) +
geom_point()
Chapter 1 - Exploring categorical data
Exploring categorical data; based on a comic book dataset of DC vs Marvel:
Counts vs proportions - the proportions are often much more meaningful:
Distribution of one variable - the typical way to begin exploring a dataset:
Example code includes:
## ISSUE - do not have (and cannot find) this tibble
comCounts <- c(1573, 2490, 836, 1, 904, 7561, 4809, 1799, 2,
2250, 32, 17, 17, 0, 2, 449, 152, 121, 0, 257
)
comGender <- rep(rep(c("Female", "Male", "Other", NA), each=5),
times=comCounts
)
comAlign <- rep(rep(c("Bad", "Good", "Neutral", "Reformed Criminals", NA), times=4),
times=comCounts
)
comics <- tibble::as_tibble(data.frame(gender=factor(comGender),
align=factor(comAlign)
)
)
# Print the first rows of the data
comics
## # A tibble: 23,272 × 2
## gender align
## <fctr> <fctr>
## 1 Female Bad
## 2 Female Bad
## 3 Female Bad
## 4 Female Bad
## 5 Female Bad
## 6 Female Bad
## 7 Female Bad
## 8 Female Bad
## 9 Female Bad
## 10 Female Bad
## # ... with 23,262 more rows
# Check levels of align
levels(comics$align)
## [1] "Bad" "Good" "Neutral"
## [4] "Reformed Criminals"
# Check the levels of gender
levels(comics$gender)
## [1] "Female" "Male" "Other"
# Create a 2-way contingency table
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
# Remove align level
# Drop the tiny "Reformed Criminals" category (only 3 rows in the table above);
# droplevels() removes the now-empty factor level.
comics <- comics %>%
filter(align != "Reformed Criminals") %>%
droplevels()
# Create side-by-side barchart of gender by alignment
# position = "dodge" places the gender bars next to each other within each align.
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "dodge")
# Create side-by-side barchart of alignment by gender
ggplot(comics, aes(x = gender, fill = align)) +
geom_bar(position = "dodge") +
theme(axis.text.x = element_text(angle = 90))
# Plot of gender by align
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar()
# Plot proportion of gender, conditional on align
# position = "fill" normalizes each bar to height 1, showing proportions.
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "fill")
# Change the order of the levels in align
# Reorder so the x-axis reads Bad -> Neutral -> Good.
comics$align <- factor(comics$align,
levels = c("Bad", "Neutral", "Good"))
# Create plot of align
ggplot(comics, aes(x = align)) +
geom_bar()
# Plot of alignment broken down by gender
# Facets give one alignment barchart per gender panel.
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~ gender)
pieFlavor <- "cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin"
# Split the " ; "-delimited flavor string (pieFlavor, defined above) into one
# factor observation per pie.
pies <- data.frame(flavor=factor(strsplit(pieFlavor, " ; ")[[1]]))
# Garden variety pie chart
# A pie chart is a stacked proportional bar wrapped into polar coordinates.
ggplot(pies, aes(x=factor(1), fill=flavor)) +
geom_bar(position = "fill") +
coord_polar(theta="y") +
labs(x='', y='')
# Put levels of flavor in descending order
# lev lists flavors from most to least frequent (ties kept adjacent).
lev <- c("apple", "key lime", "boston creme", "blueberry", "cherry", "pumpkin", "strawberry")
pies$flavor <- factor(pies$flavor, levels = lev)
# Create barchart of flavor
ggplot(pies, aes(x = flavor)) +
geom_bar(fill = "chartreuse") +
theme(axis.text.x = element_text(angle = 90))
# If you prefer that it still be multi-colored like the pie
ggplot(pies, aes(x = flavor)) +
geom_bar(aes(fill=flavor)) +
theme(axis.text.x = element_text(angle = 90))
Chapter 2 - Exploring numerical data
Exploring numerical data - cars that were available for sale in a given year (428 x 19 tbl_df):
Distribution of one variable:
Box plots are based around three characteristics of the data:
Visualization in higher dimensions:
Example code includes:
# Time to create some data . . .
carCityMPG <- "28 ; 28 ; 26 ; 26 ; 26 ; 29 ; 29 ; 26 ; 27 ; 26 ; 26 ; 32 ; 36 ; 32 ; 29 ; 29 ; 29 ; 26 ; 26 ; 26 ; 23 ; 26 ; 25 ; 24 ; 24 ; 24 ; NA ; 28 ; NA ; NA ; 28 ; 28 ; 24 ; 26 ; 26 ; 26 ; 26 ; 26 ; 32 ; 25 ; 25 ; 24 ; 22 ; 32 ; 32 ; 32 ; 35 ; 33 ; 35 ; 20 ; 21 ; 24 ; 22 ; 21 ; 22 ; 22 ; 22 ; 21 ; 21 ; 21 ; 21 ; 21 ; 20 ; 19 ; 26 ; 26 ; 32 ; 26 ; 46 ; 60 ; 19 ; 19 ; 20 ; NA ; 24 ; 20 ; 25 ; NA ; NA ; 21 ; 23 ; 24 ; 20 ; 20 ; 24 ; 20 ; 22 ; 21 ; 20 ; 24 ; 21 ; 24 ; 20 ; 59 ; 24 ; 24 ; 38 ; 24 ; 24 ; 22 ; 22 ; 20 ; 20 ; 20 ; 18 ; 20 ; 18 ; 23 ; 18 ; 18 ; 21 ; 19 ; 21 ; 22 ; 18 ; 17 ; 17 ; 21 ; 21 ; 17 ; 17 ; 18 ; 18 ; 18 ; 17 ; 22 ; 19 ; 17 ; 17 ; 19 ; 18 ; 18 ; 21 ; 20 ; 20 ; 20 ; 20 ; 21 ; 20 ; 19 ; 21 ; 21 ; 20 ; 21 ; 24 ; 22 ; 22 ; 20 ; 23 ; 20 ; 17 ; 18 ; 20 ; 18 ; 20 ; 19 ; 19 ; 20 ; 20 ; 20 ; 19 ; 20 ; 20 ; 18 ; 18 ; 21 ; 17 ; 18 ; 19 ; 18 ; 20 ; 18 ; 18 ; 20 ; 20 ; 20 ; 19 ; 19 ; 20 ; 19 ; 17 ; 17 ; NA ; 20 ; 20 ; 21 ; 21 ; 19 ; 21 ; 19 ; 18 ; 20 ; 20 ; 18 ; 20 ; 20 ; 18 ; 18 ; 20 ; 18 ; 18 ; 17 ; 17 ; 14 ; 19 ; 20 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 17 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 18 ; 17 ; 17 ; 17 ; 17 ; 17 ; 16 ; 16 ; 13 ; 20 ; 17 ; 19 ; 16 ; 18 ; 16 ; 21 ; 21 ; NA ; NA ; 21 ; 20 ; 19 ; 17 ; 15 ; 20 ; 20 ; 21 ; 16 ; 16 ; 20 ; 21 ; 17 ; 18 ; 18 ; 17 ; NA ; 20 ; 17 ; 17 ; 20 ; 19 ; 18 ; 18 ; 16 ; 16 ; 18 ; 23 ; 23 ; 18 ; 18 ; 16 ; 14 ; 13 ; 21 ; 17 ; 21 ; 21 ; 18 ; 20 ; 20 ; NA ; 18 ; 17 ; 18 ; 17 ; 20 ; 18 ; 20 ; 18 ; 24 ; 26 ; 14 ; 16 ; 14 ; 14 ; 15 ; NA ; 15 ; 15 ; 16 ; 13 ; 10 ; 15 ; 13 ; 13 ; 14 ; 17 ; 16 ; 16 ; 15 ; 19 ; 16 ; 15 ; 17 ; 17 ; 16 ; 16 ; 12 ; 15 ; 13 ; 18 ; 13 ; 13 ; 14 ; 16 ; 17 ; 15 ; 16 ; 19 ; 14 ; 21 ; 18 ; 18 ; 18 ; 13 ; 15 ; 15 ; 19 ; 18 ; 21 ; 21 ; 20 ; 20 ; 16 ; 12 ; 18 ; 22 ; 21 ; 17 ; 19 ; 22 ; 18 ; 15 ; 19 ; 22 ; 17 ; 26 ; 19 ; 16 ; 15 ; 26 ; 18 ; 19 ; 19 ; 16 ; 19 ; NA ; 20 ; 29 ; 19 ; 24 ; 31 ; 21 ; 21 ; 24 ; 29 ; 24 ; 22 ; 18 ; 22 ; 20 ; 14 ; 19 ; 19 ; 18 ; 20 ; 18 ; 17 ; 16 ; 18 ; 18 ; 16 ; 18 ; 16 ; 
19 ; 18 ; 19 ; 19 ; 18 ; 19 ; 19 ; 13 ; 14 ; 18 ; 15 ; 13 ; 16 ; 16 ; 16 ; 16 ; 15 ; 14 ; 24 ; 19 ; 17 ; NA ; 15 ; 24 ; 15 ; 17 ; 14 ; 21 ; 22 ; 16 ; 14"
carSUV <- "0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0"
carNCyl <- "4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 3 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 8 ; 5 ; 5 ; 5 ; 6 ; 5 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 12 ; 6 ; 8 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 8 ; 12 ; 5 ; 5 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 10 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 4 ; 4 ; -1 ; -1 ; 8 ; 8 ; 12 ; 4 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 8 ; 8 ; 8 ; 8 ; 8 ; 10 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 8 ; 6 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 8 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 8 ; 4 ; 5 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 4 ; 8 ; 8 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6"
carHP <- "103 ; 103 ; 140 ; 140 ; 140 ; 132 ; 132 ; 130 ; 110 ; 130 ; 130 ; 115 ; 117 ; 115 ; 103 ; 103 ; 103 ; 138 ; 138 ; 138 ; 138 ; 104 ; 104 ; 124 ; 124 ; 124 ; 148 ; 115 ; 120 ; 120 ; 126 ; 126 ; 140 ; 140 ; 140 ; 140 ; 140 ; 140 ; 108 ; 155 ; 155 ; 119 ; 119 ; 130 ; 130 ; 130 ; 108 ; 108 ; 108 ; 175 ; 180 ; 145 ; 200 ; 180 ; 150 ; 150 ; 150 ; 200 ; 200 ; 150 ; 150 ; 170 ; 155 ; 201 ; 160 ; 160 ; 127 ; 160 ; 93 ; 73 ; 170 ; 170 ; 170 ; 160 ; 160 ; 155 ; 163 ; 160 ; 120 ; 175 ; 165 ; 140 ; 175 ; 200 ; 140 ; 182 ; 165 ; 165 ; 155 ; 157 ; 210 ; 157 ; 225 ; 110 ; 115 ; 180 ; 100 ; 150 ; 200 ; 200 ; 170 ; 184 ; 205 ; 200 ; 240 ; 200 ; 240 ; 200 ; 200 ; 250 ; 200 ; 232 ; 220 ; 150 ; 232 ; 224 ; 224 ; 240 ; 240 ; 194 ; 194 ; 260 ; 280 ; 192 ; 195 ; 189 ; 215 ; 224 ; 224 ; 201 ; 205 ; 230 ; 245 ; 265 ; 265 ; 170 ; 200 ; 165 ; 165 ; 212 ; 210 ; 210 ; 225 ; 200 ; 115 ; 170 ; 170 ; 270 ; 170 ; 220 ; 220 ; 220 ; 220 ; 220 ; 184 ; 184 ; 184 ; 225 ; 225 ; 225 ; 184 ; 205 ; 205 ; 255 ; 255 ; 200 ; 239 ; 260 ; 255 ; 227 ; 225 ; 215 ; 215 ; 232 ; 232 ; 168 ; 168 ; 215 ; 215 ; 215 ; 224 ; 302 ; 275 ; 210 ; 210 ; 220 ; 250 ; 212 ; 210 ; 190 ; 270 ; 208 ; 247 ; 300 ; 208 ; 194 ; 225 ; 225 ; 220 ; 220 ; 250 ; 300 ; 330 ; 340 ; 225 ; 225 ; 325 ; 325 ; 325 ; 240 ; 275 ; 300 ; 275 ; 340 ; 340 ; 235 ; 294 ; 390 ; 294 ; 294 ; 390 ; 220 ; 300 ; 290 ; 280 ; 280 ; 239 ; 239 ; 239 ; 349 ; 302 ; 493 ; 215 ; 302 ; 221 ; 302 ; 275 ; 302 ; 210 ; 210 ; 335 ; 420 ; 197 ; 242 ; 268 ; 290 ; 450 ; 180 ; 225 ; 250 ; 333 ; 333 ; 184 ; 225 ; 320 ; 350 ; 350 ; 215 ; 500 ; 193 ; 260 ; 280 ; 240 ; 172 ; 294 ; 294 ; 390 ; 390 ; 300 ; 142 ; 142 ; 197 ; 238 ; 302 ; 493 ; 493 ; 192 ; 349 ; 210 ; 210 ; 271 ; 287 ; 287 ; 340 ; 315 ; 315 ; 315 ; 477 ; 228 ; 258 ; 227 ; 300 ; 180 ; 138 ; 295 ; 320 ; 295 ; 295 ; 230 ; 310 ; 232 ; 275 ; 285 ; 325 ; 316 ; 275 ; 300 ; 305 ; 240 ; 265 ; 225 ; 325 ; 275 ; 185 ; 275 ; 210 ; 240 ; 193 ; 195 ; 192 ; 282 ; 235 ; 235 ; 230 ; 302 ; 292 ; 288 ; 210 ; 215 ; 215 ; 240 ; 185 ; 
340 ; 143 ; 185 ; 245 ; 230 ; 325 ; 220 ; 268 ; 165 ; 201 ; 160 ; 160 ; 173 ; 150 ; 190 ; 217 ; 174 ; 130 ; 160 ; 180 ; 165 ; 161 ; 220 ; 340 ; 184 ; 200 ; 250 ; 130 ; 155 ; 280 ; 315 ; 104 ; 215 ; 168 ; 221 ; 302 ; 155 ; 160 ; 245 ; 130 ; 250 ; 140 ; 108 ; 165 ; 165 ; 155 ; 130 ; 115 ; 170 ; 270 ; 170 ; 208 ; 190 ; 185 ; 180 ; 215 ; 150 ; 215 ; 193 ; 190 ; 240 ; 240 ; 195 ; 200 ; 201 ; 240 ; 240 ; 185 ; 185 ; 185 ; 230 ; 230 ; 345 ; 295 ; 175 ; 200 ; 300 ; 300 ; 210 ; 210 ; 215 ; 231 ; 300 ; 143 ; 175 ; 285 ; 300 ; 190 ; 143 ; 207 ; 180 ; 305 ; 165 ; 142 ; 190 ; 190"
carMSRP <- "11690 ; 12585 ; 14610 ; 14810 ; 16385 ; 13670 ; 15040 ; 13270 ; 13730 ; 15460 ; 15580 ; 13270 ; 14170 ; 15850 ; 10539 ; 11839 ; 11939 ; 13839 ; 15389 ; 15389 ; 16040 ; 10280 ; 11155 ; 12360 ; 13580 ; 14630 ; 15500 ; 16999 ; 14622 ; 16722 ; 12740 ; 14740 ; 15495 ; 10995 ; 14300 ; 15825 ; 14850 ; 16350 ; 12965 ; 12884 ; 14500 ; 12269 ; 15568 ; 14085 ; 15030 ; 15295 ; 10760 ; 11560 ; 11290 ; 22180 ; 21900 ; 18995 ; 20370 ; 21825 ; 17985 ; 22000 ; 19090 ; 21840 ; 22035 ; 18820 ; 20220 ; 19135 ; 20320 ; 22735 ; 19860 ; 22260 ; 17750 ; 19490 ; 20140 ; 19110 ; 19339 ; 20339 ; 18435 ; 17200 ; 19270 ; 21595 ; 19999 ; 19312 ; 17232 ; 19240 ; 17640 ; 18825 ; 22450 ; 22395 ; 17735 ; 21410 ; 19945 ; 20445 ; 17262 ; 19560 ; 22775 ; 19635 ; 21965 ; 20510 ; 18715 ; 19825 ; 21055 ; 21055 ; 23820 ; 26990 ; 25940 ; 28495 ; 26470 ; 24895 ; 28345 ; 25000 ; 27995 ; 23495 ; 24225 ; 29865 ; 24130 ; 26860 ; 25955 ; 25215 ; 24885 ; 24345 ; 27370 ; 23760 ; 26960 ; 24589 ; 26189 ; 28495 ; 29795 ; 29995 ; 26000 ; 26060 ; 28370 ; 24695 ; 29595 ; 23895 ; 29282 ; 25700 ; 23290 ; 27490 ; 29440 ; 23675 ; 24295 ; 25645 ; 27145 ; 29345 ; 26560 ; 25920 ; 26510 ; 23785 ; 23215 ; 23955 ; 25135 ; 33195 ; 35940 ; 31840 ; 33430 ; 34480 ; 36640 ; 39640 ; 30795 ; 37995 ; 30245 ; 35495 ; 36995 ; 37245 ; 39995 ; 32245 ; 35545 ; 30835 ; 33295 ; 30950 ; 30315 ; 32445 ; 31145 ; 33995 ; 32350 ; 31045 ; 32415 ; 32495 ; 36895 ; 32280 ; 33480 ; 35920 ; 37630 ; 38830 ; 30895 ; 34495 ; 35995 ; 30860 ; 33360 ; 35105 ; 39465 ; 31545 ; 30920 ; 33180 ; 39235 ; 31745 ; 34845 ; 37560 ; 37730 ; 37885 ; 43755 ; 46100 ; 42490 ; 44240 ; 42840 ; 49690 ; 69190 ; 48040 ; 44295 ; 44995 ; 54995 ; 69195 ; 73195 ; 40720 ; 45445 ; 50595 ; 47955 ; 42845 ; 52545 ; 43895 ; 49995 ; 63120 ; 68995 ; 59995 ; 74995 ; 41010 ; 48450 ; 55750 ; 40095 ; 43495 ; 41815 ; 44925 ; 50470 ; 52120 ; 94820 ; 128420 ; 45707 ; 52800 ; 48170 ; 57270 ; 74320 ; 86970 ; 40670 ; 43175 ; 65000 ; 75000 ; 40565 ; 42565 ; 45210 ; 89765 ; 84600 ; 35940 ; 
37390 ; 40590 ; 48195 ; 56595 ; 33895 ; 41045 ; 76200 ; 44535 ; 51535 ; 34495 ; 81795 ; 18345 ; 29380 ; 37530 ; 33260 ; 18739 ; 69995 ; 74995 ; 81995 ; 86995 ; 63200 ; 22388 ; 25193 ; 25700 ; 27200 ; 90520 ; 121770 ; 126670 ; 40320 ; 56170 ; 25092 ; 26992 ; 29562 ; 26910 ; 34390 ; 33500 ; 79165 ; 84165 ; 76765 ; 192465 ; 43365 ; 52365 ; 25045 ; 31545 ; 22570 ; 25130 ; 52795 ; 46995 ; 42735 ; 41465 ; 32235 ; 41475 ; 34560 ; 31890 ; 35725 ; 46265 ; 49995 ; 31849 ; 52775 ; 33840 ; 35695 ; 36945 ; 37000 ; 52195 ; 37895 ; 26545 ; 30295 ; 29670 ; 27560 ; 20449 ; 27905 ; 19635 ; 72250 ; 45700 ; 64800 ; 39195 ; 42915 ; 76870 ; 46470 ; 29995 ; 30492 ; 33112 ; 27339 ; 21595 ; 56665 ; 20585 ; 23699 ; 27710 ; 27930 ; 54765 ; 35515 ; 41250 ; 20255 ; 22515 ; 19860 ; 18690 ; 21589 ; 20130 ; 25520 ; 39250 ; 25995 ; 21087 ; 18892 ; 20939 ; 17163 ; 20290 ; 40840 ; 49090 ; 32845 ; 22225 ; 31230 ; 17475 ; 22290 ; 34895 ; 36395 ; 11905 ; 32455 ; 33780 ; 50670 ; 60670 ; 22595 ; 17495 ; 28739 ; 17045 ; 40845 ; 23560 ; 14165 ; 21445 ; 23895 ; 16497 ; 16695 ; 19005 ; 24955 ; 40235 ; 26135 ; 35145 ; 26395 ; 27020 ; 27490 ; 38380 ; 21795 ; 32660 ; 26930 ; 25640 ; 24950 ; 27450 ; 20615 ; 28750 ; 33995 ; 24780 ; 32780 ; 28790 ; 23845 ; 31370 ; 23495 ; 28800 ; 52975 ; 36100 ; 18760 ; 20310 ; 40340 ; 41995 ; 17630 ; 20300 ; 20215 ; 22010 ; 33540 ; 14385 ; 16530 ; 25717 ; 29322 ; 25395 ; 14840 ; 22350 ; 19479 ; 26650 ; 24520 ; 12800 ; 16495 ; 25935"
carWidth <- "66 ; 66 ; 69 ; 68 ; 69 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 66 ; 66 ; 66 ; 68 ; 68 ; 68 ; 72 ; 66 ; 66 ; 68 ; 68 ; 68 ; NA ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 67 ; 67 ; 67 ; 68 ; 68 ; 67 ; 68 ; 68 ; 68 ; 68 ; 67 ; 67 ; 67 ; 65 ; 65 ; 65 ; 73 ; 73 ; 70 ; 70 ; 73 ; 67 ; 67 ; 71 ; 71 ; 75 ; 71 ; 71 ; 67 ; 73 ; 73 ; 71 ; 71 ; 68 ; 67 ; 68 ; 67 ; 72 ; 72 ; 72 ; NA ; 70 ; 73 ; 67 ; 72 ; 67 ; 70 ; 67 ; 70 ; 70 ; 74 ; 68 ; 69 ; 69 ; 69 ; 72 ; 71 ; 71 ; 72 ; 72 ; 68 ; 68 ; 68 ; 68 ; 68 ; 68 ; 69 ; 70 ; 69 ; 74 ; 73 ; 73 ; 73 ; 73 ; 70 ; 73 ; 74 ; 74 ; 74 ; 67 ; 64 ; 75 ; 78 ; 78 ; 72 ; 71 ; 72 ; 72 ; 69 ; 72 ; 70 ; 73 ; 68 ; 68 ; 78 ; 78 ; 73 ; 70 ; 72 ; 70 ; 72 ; 72 ; 70 ; 74 ; 69 ; 69 ; 69 ; 72 ; 71 ; 72 ; 68 ; 68 ; 69 ; 68 ; 72 ; 70 ; 70 ; 70 ; 70 ; 71 ; 71 ; 69 ; 69 ; 69 ; 69 ; 69 ; 69 ; 73 ; 74 ; 75 ; 71 ; 74 ; 69 ; 78 ; 69 ; 70 ; 70 ; 71 ; 68 ; 68 ; 73 ; 73 ; 68 ; 68 ; 68 ; 68 ; 68 ; 78 ; 78 ; 74 ; 69 ; 69 ; 71 ; 71 ; 69 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 72 ; 72 ; 72 ; 72 ; 70 ; 70 ; 71 ; 71 ; 75 ; 70 ; 69 ; 73 ; 73 ; 75 ; 75 ; 75 ; 74 ; 74 ; 75 ; 70 ; 73 ; 72 ; 72 ; 72 ; 73 ; 73 ; 73 ; 71 ; 71 ; 72 ; 73 ; 73 ; 78 ; 78 ; 78 ; 68 ; 73 ; 73 ; 69 ; 69 ; 71 ; 71 ; 73 ; 73 ; 69 ; 69 ; 75 ; 75 ; 72 ; 72 ; 72 ; 71 ; 78 ; 73 ; 73 ; 73 ; 70 ; 70 ; 70 ; 70 ; 72 ; 74 ; 74 ; 70 ; 75 ; 73 ; 73 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 71 ; 72 ; 66 ; 66 ; NA ; NA ; 72 ; 72 ; 72 ; 68 ; 68 ; 69 ; 69 ; 70 ; 72 ; 72 ; 73 ; 70 ; 72 ; 70 ; 72 ; 70 ; 70 ; 69 ; 69 ; 68 ; 67 ; 79 ; 73 ; 79 ; 79 ; 76 ; 80 ; 79 ; 75 ; 79 ; 79 ; 81 ; 76 ; 80 ; 79 ; 78 ; 77 ; 73 ; 74 ; 75 ; 74 ; 75 ; 72 ; 77 ; 70 ; 72 ; 73 ; 76 ; 74 ; 76 ; 73 ; 76 ; 71 ; 72 ; 72 ; 74 ; 75 ; 72 ; 74 ; 76 ; 72 ; 70 ; 74 ; 72 ; 76 ; 76 ; 75 ; 67 ; 70 ; 70 ; 72 ; 73 ; 72 ; 67 ; 74 ; 71 ; 72 ; 69 ; 70 ; 67 ; 68 ; 71 ; 70 ; 69 ; 70 ; 79 ; 67 ; 73 ; 76 ; 76 ; 66 ; 68 ; 68 ; 71 ; 71 ; 73 ; 67 ; 74 ; 70 ; 71 ; 69 ; 67 ; 68 ; 69 ; 68 ; 70 ; 68 ; 69 ; 69 ; 68 ; 73 ; 78 ; 72 ; 79 ; 79 ; 79 ; 79 ; 77 ; 78 ; 76 ; 76 ; 75 ; 72 ; 77 ; 
78 ; 78 ; 72 ; 72 ; 72 ; 77 ; 77 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA"
carHwyMPG <- as.integer(strsplit("34 ; 34 ; 37 ; 37 ; 37 ; 36 ; 36 ; 33 ; 36 ; 33 ; 33 ; 38 ; 44 ; 38 ; 33 ; 33 ; 33 ; 34 ; 34 ; 34 ; 30 ; 33 ; 32 ; 32 ; 32 ; 32 ; NA ; 37 ; NA ; NA ; 35 ; 35 ; 33 ; 35 ; 35 ; 35 ; 35 ; 35 ; 38 ; 31 ; 31 ; 31 ; 30 ; 40 ; 40 ; 40 ; 43 ; 39 ; 43 ; 30 ; 32 ; 34 ; 30 ; 32 ; 29 ; 29 ; 30 ; 28 ; 29 ; 28 ; 28 ; 28 ; 27 ; 26 ; 34 ; 34 ; 37 ; 30 ; 51 ; 66 ; 27 ; 27 ; 27 ; NA ; 32 ; 27 ; 34 ; NA ; NA ; 26 ; 28 ; 32 ; 29 ; 30 ; 33 ; 28 ; 28 ; 28 ; 27 ; 33 ; 29 ; 33 ; 29 ; 51 ; 31 ; 31 ; 46 ; 31 ; 31 ; 29 ; 31 ; 29 ; 29 ; 30 ; 28 ; 30 ; 28 ; 32 ; 28 ; 27 ; 29 ; 27 ; 27 ; 30 ; 27 ; 25 ; 25 ; 30 ; 30 ; 26 ; 26 ; 26 ; 26 ; 26 ; 25 ; 30 ; 26 ; 25 ; 25 ; 26 ; 25 ; 26 ; 26 ; 28 ; 28 ; 29 ; 30 ; 28 ; 27 ; 26 ; 29 ; 29 ; 29 ; 30 ; 30 ; 31 ; 29 ; 28 ; 30 ; 28 ; 26 ; 25 ; 27 ; 25 ; 29 ; 27 ; 27 ; 30 ; 30 ; 29 ; 28 ; 29 ; 29 ; 25 ; 27 ; 28 ; 25 ; 26 ; 26 ; 25 ; 29 ; 25 ; 24 ; 26 ; 26 ; 25 ; 25 ; 26 ; 26 ; 27 ; 25 ; 23 ; NA ; 28 ; 28 ; 29 ; 29 ; 26 ; 29 ; 26 ; 25 ; 27 ; 28 ; 25 ; 28 ; 27 ; 24 ; 24 ; 27 ; 25 ; 25 ; 24 ; 24 ; 20 ; 28 ; 30 ; 26 ; 26 ; 26 ; 28 ; 26 ; 26 ; 26 ; 23 ; 23 ; 26 ; 28 ; 24 ; 28 ; 28 ; 24 ; 25 ; 23 ; 25 ; 24 ; 24 ; 25 ; 25 ; 25 ; 21 ; 24 ; 19 ; 26 ; 22 ; 27 ; 20 ; 26 ; 24 ; 29 ; 30 ; NA ; NA ; 28 ; 26 ; 26 ; 24 ; 22 ; 28 ; 28 ; 29 ; 24 ; 23 ; 28 ; 29 ; 25 ; 25 ; 25 ; 25 ; NA ; 29 ; 25 ; 24 ; 25 ; 26 ; 26 ; 26 ; 23 ; 23 ; 23 ; 28 ; 28 ; 25 ; 24 ; 23 ; 21 ; 19 ; 29 ; 22 ; 28 ; 28 ; 26 ; 26 ; 26 ; NA ; 26 ; 24 ; 26 ; 24 ; 29 ; 26 ; 27 ; 24 ; 33 ; 32 ; 18 ; 21 ; 18 ; 18 ; 21 ; NA ; 19 ; 19 ; 19 ; 17 ; 12 ; 20 ; 18 ; 19 ; 17 ; 23 ; 23 ; 22 ; 21 ; 26 ; 21 ; 20 ; 22 ; 21 ; 21 ; 19 ; 16 ; 19 ; 17 ; 24 ; 18 ; 14 ; 17 ; 21 ; 21 ; 19 ; 21 ; 26 ; 18 ; 26 ; 22 ; 21 ; 24 ; 17 ; 20 ; 20 ; 22 ; 23 ; 25 ; 24 ; 26 ; 24 ; 19 ; 16 ; 21 ; 25 ; 27 ; 20 ; 22 ; 27 ; 25 ; 21 ; 26 ; 30 ; 23 ; 33 ; 26 ; 22 ; 19 ; 33 ; 24 ; 25 ; 27 ; 24 ; 26 ; NA ; 25 ; 36 ; 29 ; 34 ; 35 ; 28 ; 28 ; 29 ; 36 ; 30 ; 31 ; 25 ; 29 ; 27 ; 17 ; 26 ; 26 ; 25 ; 26 ; 25 ; 23 ; 20 ; 25 ; 
25 ; 22 ; 25 ; 23 ; 26 ; 25 ; 26 ; 26 ; 24 ; 27 ; 27 ; 17 ; 18 ; 23 ; 21 ; 17 ; 19 ; 22 ; 22 ; 21 ; 19 ; 18 ; 29 ; 24 ; 20 ; NA ; 19 ; 29 ; 19 ; 20 ; 18 ; 28 ; 27 ; 20 ; 17", " ; ")[[1]])
## Warning: NAs introduced by coercion
# Assemble the `cars` data frame from the " ; "-delimited strings defined above.
# Literal "NA" tokens fail as.integer(), producing the coercion warnings below.
cars <- data.frame(city_mpg=as.integer(strsplit(carCityMPG, " ; ")[[1]]),
suv=as.logical(as.integer(strsplit(carSUV, " ; ")[[1]])),
ncyl=as.integer(strsplit(carNCyl, " ; ")[[1]]),
horsepwr=as.integer(strsplit(carHP, " ; ")[[1]]),
msrp=as.integer(strsplit(carMSRP, " ; ")[[1]]),
width=as.integer(strsplit(carWidth, " ; ")[[1]]),
hwy_mpg=carHwyMPG
)
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
# Count missing values per column (city_mpg/hwy_mpg: 14 each; width: 28).
colSums(is.na(cars))
## city_mpg suv ncyl horsepwr msrp width hwy_mpg
## 14 0 0 0 0 28 14
# Learn data structure
str(cars)
## 'data.frame': 428 obs. of 7 variables:
## $ city_mpg: int 28 28 26 26 26 29 29 26 27 26 ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr: int 103 103 140 140 140 132 132 130 110 130 ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
# Create faceted histogram
# Side-by-side city-mpg histograms: non-SUVs (FALSE) vs SUVs (TRUE).
ggplot(cars, aes(x = city_mpg)) +
geom_histogram() +
facet_grid(. ~ suv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).
# Filter cars with 4, 6, 8 cylinders
# %in% keeps only the common cylinder counts (drops 3, 5, 10, 12, -1 rows).
common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
# Create box plots of city mpg by ncyl
# ncyl is numeric, so wrap it in as.factor() to get one box per group.
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Create overlaid density plots for same data
# alpha keeps the overlapping densities readable.
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
geom_density(alpha = .3)
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Create hist of horsepwr
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
ggtitle("Histogram of Horsepower")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create hist of horsepwr for affordable cars
# xlim keeps the axis comparable to the full-data plot; one car falls
# outside the limits (see warnings below).
cars %>%
filter(msrp < 25000) %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
xlim(c(90, 550)) +
ggtitle("Histogram of Horsepower\n(Affordable Cars Only)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
# Create hist of horsepwr with binwidth of 3
# The next three plots show how binwidth choice changes the apparent shape.
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 3) +
ggtitle("Histogram of Horsepower\n(Bucket Size=3)")
# Create hist of horsepwr with binwidth of 30
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 30) +
ggtitle("Histogram of Horsepower\n(Bucket Size=30)")
# Create hist of horsepwr with binwidth of 60
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 60) +
ggtitle("Histogram of Horsepower\n(Bucket Size=60)")
# Construct box plot of msrp
# x = 1 is a dummy so a single box is drawn for the whole column.
cars %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
# Exclude outliers from data
# NOTE(review): cars_no_out is created here but the plots below still use
# `cars` — presumably the exercise plotted cars_no_out as well.
cars_no_out <- cars %>%
filter(msrp < 100000)
# Create plot of city_mpg
cars %>%
ggplot(aes(x=city_mpg)) +
geom_density()
## Warning: Removed 14 rows containing non-finite values (stat_density).
# Create plot of width
cars %>%
ggplot(aes(x=width)) +
geom_density()
## Warning: Removed 28 rows containing non-finite values (stat_density).
# Create plot of city_mpg
cars %>%
ggplot(aes(x=factor(1), y=city_mpg)) +
geom_boxplot()
## Warning: Removed 14 rows containing non-finite values (stat_boxplot).
# Create plot of width
cars %>%
ggplot(aes(x=factor(1), y=width)) +
geom_boxplot()
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
# Facet hists using hwy mileage and ncyl
# Two faceting variables: rows = cylinder count, columns = SUV flag.
common_cyl %>%
ggplot(aes(x = hwy_mpg)) +
geom_histogram() +
facet_grid(ncyl ~ suv) +
ggtitle("Histogram of HighwayMPG\n(By Cylinders vs. SUV)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
Chapter 3 - Numerical summaries
Measures of center - “what is the typical value”?:
Measures of variability - what are the typical distances from “typical”?:
Shape and transformations - modality and skew:
Outliers - observations with extreme values:
Example code includes:
# Create the data assumed for the exercises
# Load the gapminder panel data (country-year life expectancy / population / GDP).
data(gapminder, package="gapminder")
gapminder <- tibble::as_tibble(gapminder)
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Create dataset of 2007 data
# One row per country: keep only the most recent survey year.
gap2007 <- filter(gapminder, year == 2007)
# Compute groupwise mean and median lifeExp
# Unnamed summaries produce the backtick-quoted column names seen below.
gap2007 %>%
group_by(continent) %>%
summarize(mean(lifeExp),
median(lifeExp)
)
## # A tibble: 5 × 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fctr> <dbl> <dbl>
## 1 Africa 54.80604 52.9265
## 2 Americas 73.60812 72.8990
## 3 Asia 70.72848 72.3960
## 4 Europe 77.64860 78.6085
## 5 Oceania 80.71950 80.7195
# Generate box plots of lifeExp for each continent
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
# Compute groupwise measures of spread
# Standard deviation, interquartile range, and group size per continent.
gap2007 %>%
group_by(continent) %>%
summarize(sd(lifeExp),
IQR(lifeExp),
n()
)
## # A tibble: 5 × 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fctr> <dbl> <dbl> <int>
## 1 Africa 9.6307807 11.61025 52
## 2 Americas 4.4409476 4.63200 25
## 3 Asia 7.9637245 10.15200 33
## 4 Europe 2.9798127 4.78250 30
## 5 Oceania 0.7290271 0.51550 2
# Generate overlaid density plots
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)
# Compute stats for lifeExp in Americas
gap2007 %>%
filter(continent == "Americas") %>%
summarize(mean(lifeExp),
sd(lifeExp)
)
## # A tibble: 1 × 2
## `mean(lifeExp)` `sd(lifeExp)`
## <dbl> <dbl>
## 1 73.60812 4.440948
# Compute stats for population
# Median/IQR are the robust choices for this strongly right-skewed variable.
gap2007 %>%
summarize(median(pop),
IQR(pop)
)
## # A tibble: 1 × 2
## `median(pop)` `IQR(pop)`
## <dbl> <dbl>
## 1 10517531 26702008
# Create density plot of old variable
gap2007 %>%
ggplot(aes(x = pop)) +
geom_density()
# Transform the skewed pop variable
# log() here is the natural log; it pulls in the long right tail.
gap2007 <- gap2007 %>%
mutate(log_pop = log(pop))
# Create density plot of new variable
gap2007 %>%
ggplot(aes(x = log_pop)) +
geom_density()
# Filter for Asia, add column indicating outliers
# Flag unusually low life expectancies (< 50 years) as outliers.
gap_asia <- gap2007 %>%
filter(continent == "Asia") %>%
mutate(is_outlier = (lifeExp < 50))
# Remove outliers, create box plot of lifeExp
gap_asia %>%
filter(!is_outlier) %>%
ggplot(aes(x = factor(1), y = lifeExp)) +
geom_boxplot()
Chapter 4 - Case Study
Introducing the data - the email dataset (tibble 3,921 x 21):
Check-in #1:
Check-in #2:
Example code includes:
# Load the openintro email dataset (3,921 messages, spam indicator + features).
data(email, package="openintro")
email <- tibble::as_tibble(email)
str(email)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3921 obs. of 21 variables:
## $ spam : num 0 0 0 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 1 1 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 0 0 0 0 0 1 0 0 ...
## $ sent_email : num 0 0 0 0 0 0 1 1 0 0 ...
## $ time : POSIXct, format: "2012-01-01 00:16:41" "2012-01-01 01:03:59" ...
## $ image : num 0 0 0 0 0 0 0 1 0 0 ...
## $ attach : num 0 0 0 0 0 0 0 1 0 0 ...
## $ dollar : num 0 0 4 0 0 0 0 0 0 0 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 1 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 2 2 0 0 0 0 ...
## $ num_char : num 11.37 10.5 7.77 13.26 1.23 ...
## $ line_breaks : int 202 202 192 255 29 25 193 237 69 68 ...
## $ format : num 1 1 1 1 0 0 1 1 0 1 ...
## $ re_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 0 1 6 48 1 1 1 18 1 0 ...
## $ number : Factor w/ 3 levels "none","small",..: 3 2 2 2 1 1 3 2 2 2 ...
# Compute summary statistics
# Robust center/spread of message length (num_char) for spam (1) vs not (0).
email %>%
group_by(spam) %>%
summarize(median(num_char), IQR(num_char))
## # A tibble: 2 × 3
## spam `median(num_char)` `IQR(num_char)`
## <dbl> <dbl> <dbl>
## 1 0 6.831 13.58225
## 2 1 1.046 2.81800
# Create plot
# Log-transform the right-skewed num_char before comparing distributions.
email %>%
mutate(log_num_char = log(num_char)) %>%
ggplot(aes(x = factor(spam), y = log_num_char)) +
geom_boxplot()
# Create plot for spam and exclaim_mess
# log(1 + x) handles the many zero exclamation counts (log(0) is -Inf).
email %>% ggplot(aes(x=log(1 + exclaim_mess), fill=factor(spam))) + geom_density(alpha=0.5)
# Create plot of proportion of spam by image
email %>%
mutate(has_image = (image > 0)) %>%
ggplot(aes(x = has_image, fill = factor(spam))) +
geom_bar(position = "fill")
# Do images get counted as attachments?
# Zero rows where image > attach, so every image is also counted in attach.
sum(email$image > email$attach)
## [1] 0
# Question 1
email %>%
filter(dollar > 0) %>%
group_by(spam) %>%
summarize(mean(dollar))
## # A tibble: 2 × 2
## spam `mean(dollar)`
## <dbl> <dbl>
## 1 0 8.211078
## 2 1 3.435897
# Question 2
email %>%
filter(dollar > 10) %>%
ggplot(aes(x = factor(spam))) +
geom_bar()
# Reorder levels
# Put the ordinal categories in their natural order for plotting.
email$number <- factor(email$number, levels=c("none", "small", "big"))
# Construct plot of number
ggplot(email, aes(x=number, fill=factor(spam))) +
geom_bar(position="fill")
Chapter 1 - Introduction to Ideas of Inference
Statistical inference is the process of making claims about a population based on information from a sample of data:
Randomized distributions:
Using the randomization distribution - comparing the observed statistic to the null distribution:
The sample being consistent with the null hypothesis does not “prove” the null hypothesis; a hypothesis test can only “reject” or “fail to reject” the null — it can never confirm it
Example code includes:
# PROBLEM - I DO NOT HAVE oilabs::rep_sample_n() ; cut/paste to replicate as oilabs_rep_sample_n
# Copied code from https://github.com/OpenIntroOrg/oilabs/blob/master/R/rep_sample_n.R
# Repeated sampling helper: draw `reps` samples of `size` rows from `tbl` and
# stack them into a single data frame tagged with a `replicate` id column.
#
# Args:
#   tbl     : data frame (or tibble) to sample rows from
#   size    : number of rows drawn per replicate
#   replace : sample rows with replacement within a replicate? (default FALSE)
#   reps    : number of replicates (default 1)
# Returns: the stacked samples (size * reps rows), grouped by `replicate`
#   via dplyr::group_by so downstream summarize() works per replicate.
oilabs_rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1) {
  n <- nrow(tbl)
  # One vector of row indices per replicate, flattened in replicate order.
  i <- unlist(replicate(reps, sample.int(n, size, replace = replace),
                        simplify = FALSE))
  # seq_len(reps) avoids the 1:reps footgun (1:0 yields c(1, 0));
  # each = size labels every sampled row with its replicate number and is
  # equivalent to the original rep(1:reps, rep(size, reps)).
  rep_tbl <- cbind(replicate = rep(seq_len(reps), each = size), tbl[i, ])
  dplyr::group_by(rep_tbl, replicate)
}
And, then the actual coding:
# Load the NHANES survey data (one row per respondent).
data(NHANES, package="NHANES")
# What are the variables in the NHANES dataset?
names(NHANES)
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeDecade" "AgeMonths"
## [7] "Race1" "Race3" "Education"
## [10] "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn"
## [16] "Work" "Weight" "Length"
## [19] "HeadCirc" "Height" "BMI"
## [22] "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1"
## [28] "BPDia1" "BPSys2" "BPDia2"
## [31] "BPSys3" "BPDia3" "Testosterone"
## [34] "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2"
## [40] "Diabetes" "DiabetesAge" "HealthGen"
## [43] "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [46] "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble"
## [52] "PhysActive" "PhysActiveDays" "TVHrsDay"
## [55] "CompHrsDay" "TVHrsDayChild" "CompHrsDayChild"
## [58] "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n"
## [64] "SmokeAge" "Marijuana" "AgeFirstMarij"
## [67] "RegularMarij" "AgeRegMarij" "HardDrugs"
## [70] "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation"
## [76] "PregnantNow"
# Create bar plot for Home Ownership by Gender
ggplot(NHANES, aes(x = Gender, fill = HomeOwn)) +
geom_bar(position = "fill") +
ylab("Relative frequencies")
# Density for SleepHrsNight colored by SleepTrouble, faceted by HealthGen
ggplot(NHANES, aes(x = SleepHrsNight, col = SleepTrouble)) +
geom_density(adjust = 2) +
facet_wrap(~ HealthGen)
## Warning: Removed 2245 rows containing non-finite values (stat_density).
# Subset the data: homes
# Keep only the two variables of interest and the Own/Rent categories.
homes <- NHANES %>%
select(Gender, HomeOwn) %>%
filter(HomeOwn %in% c("Own", "Rent"))
# Perform one permutation
# sample(HomeOwn) shuffles ownership labels, breaking any gender association;
# diff() gives the male - female gap in ownership proportion.
homes %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own))
## # A tibble: 1 × 2
## diff_perm diff_orig
## <dbl> <dbl>
## 1 0.001644559 -0.007828723
# Perform 10 permutations
# oilabs_rep_sample_n() presumably returns data grouped by replicate (the
# outputs below show "Groups: replicate"), so sample() shuffles within each
# replicate -- TODO confirm against the oilabs helper's definition.
homeown_perm <- homes %>%
oilabs_rep_sample_n(size = nrow(homes), reps = 10) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Print differences to console
# diff_orig repeats because the observed difference is the same in every replicate
homeown_perm
## # A tibble: 10 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 -0.007416841 -0.007828723
## 2 2 0.023886176 -0.007828723
## 3 3 -0.005769314 -0.007828723
## 4 4 0.004939613 -0.007828723
## 5 5 0.005351495 -0.007828723
## 6 6 -0.008240605 -0.007828723
## 7 7 -0.006593078 -0.007828723
## 8 8 -0.001238614 -0.007828723
## 9 9 -0.020185177 -0.007828723
## 10 10 0.007822786 -0.007828723
# Dotplot of 10 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_dotplot(binwidth = 0.001)
# Perform 100 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(nrow(homes), reps=100) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Dotplot of 100 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_dotplot(binwidth = 0.001)
# Perform 1000 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(nrow(homes), reps=1000) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Density plot of 1000 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_density()
# Plot permuted differences
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_density() +
geom_vline(aes(xintercept = diff_orig),
col = "red")
# Compare permuted differences to observed difference
# Counts the permutations whose difference is at least as small as the
# observed one; 208/1000 here, so the data are consistent with the null
homeown_perm %>%
summarize(sum(diff_orig >= diff_perm))
## # A tibble: 1 × 1
## `sum(diff_orig >= diff_perm)`
## <int>
## 1 208
Chapter 2 - Completing a randomization study
Gender discrimination case - promotion case study among bank managers:
Distribution of statistics - different forms of the null hypothesis:
Why 0.05 for the critical region?
What is a p-value?
Example code includes:
# Raw data for the gender-discrimination promotion study, encoded as
# " ; "-separated strings; the two strings align row-by-row by position
# (35 promoted then 13 not_promoted).
discPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
discSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female"
# levels= pins the factor order so "promoted" is the second level
disc <- data.frame(promote=factor(strsplit(discPromote, " ; ")[[1]],
levels=c("not_promoted", "promoted")
),
sex=factor(strsplit(discSex, " ; ")[[1]])
)
# Create a contingency table summarizing the data
table(disc$sex, disc$promote)
##
## not_promoted promoted
## female 10 14
## male 3 21
# Find proportion of each sex who were promoted
disc %>%
group_by(sex) %>%
summarize(promoted_prop=mean(promote == "promoted"))
## # A tibble: 2 × 2
## sex promoted_prop
## <fctr> <dbl>
## 1 female 0.5833333
## 2 male 0.8750000
# Sample the entire data frame 5 times
# Each replicate is a full copy of disc (48 rows x 5 = 240 rows)
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5)
## Source: local data frame [240 x 3]
## Groups: replicate [5]
##
## replicate promote sex
## * <int> <fctr> <fctr>
## 1 1 promoted male
## 2 1 promoted female
## 3 1 promoted male
## 4 1 promoted female
## 5 1 promoted female
## 6 1 promoted female
## 7 1 not_promoted male
## 8 1 not_promoted female
## 9 1 promoted male
## 10 1 not_promoted female
## # ... with 230 more rows
# Shuffle the promote variable within replicate
# The result is grouped by replicate, so sample() permutes inside each copy
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote))
## Source: local data frame [240 x 4]
## Groups: replicate [5]
##
## replicate promote sex prom_perm
## <int> <fctr> <fctr> <fctr>
## 1 1 promoted female promoted
## 2 1 promoted male not_promoted
## 3 1 promoted female promoted
## 4 1 not_promoted female promoted
## 5 1 promoted female promoted
## 6 1 promoted male promoted
## 7 1 promoted male promoted
## 8 1 promoted female promoted
## 9 1 promoted male not_promoted
## 10 1 promoted male promoted
## # ... with 230 more rows
# Find the proportion of promoted in each replicate and sex
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted"))
## Source: local data frame [10 x 4]
## Groups: replicate [?]
##
## replicate sex prop_prom_perm prop_prom
## <int> <fctr> <dbl> <dbl>
## 1 1 female 0.7916667 0.5833333
## 2 1 male 0.6666667 0.8750000
## 3 2 female 0.6666667 0.5833333
## 4 2 male 0.7916667 0.8750000
## 5 3 female 0.6250000 0.5833333
## 6 3 male 0.8333333 0.8750000
## 7 4 female 0.6666667 0.5833333
## 8 4 male 0.7916667 0.8750000
## 9 5 female 0.7916667 0.5833333
## 10 5 male 0.6666667 0.8750000
# Difference in proportion of promoted across sex grouped by gender
# With (female, male) ordering, diff() gives male - female per replicate
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
## # A tibble: 5 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 0.12500000 0.2916667
## 2 2 -0.04166667 0.2916667
## 3 3 -0.20833333 0.2916667
## 4 4 -0.04166667 0.2916667
## 5 5 0.29166667 0.2916667
# Create a data frame of differences in promotion rates
# 1000 permutations give a null distribution to compare diff_orig against
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Histogram of permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Find the 0.90, 0.95, and 0.99 quantiles of diff_perm
# (upper critical values of the permutation null distribution)
# Spell out quantile()'s `probs` argument rather than relying on partial
# matching of the abbreviation `p`.
disc_perm %>%
summarize(q.90 = quantile(diff_perm, probs = 0.90),
q.95 = quantile(diff_perm, probs = 0.95),
q.99 = quantile(diff_perm, probs = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.1333333 0.2083333 0.2916667
# Find the 0.10, 0.05, and 0.01 quantiles of diff_perm
# (lower critical values, for a left-tailed test)
disc_perm %>%
summarize(q.01 = quantile(diff_perm, probs = 0.01),
q.05 = quantile(diff_perm, probs = 0.05),
q.10 = quantile(diff_perm, probs = 0.10)
)
## # A tibble: 1 × 3
## q.01 q.05 q.10
## <dbl> <dbl> <dbl>
## 1 -0.2916667 -0.2083333 -0.125
# Small (n = 16) and big (n = 480) versions of the promotion study, encoded
# as " ; "-separated code strings; the sex and promote strings align by row.
discsmallSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 1 ; 1 ; 1" # 2 is male
discbigSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is male
discbigPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is promote
discsmallPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1" # 2 is promote
# Decode a " ; "-separated string of codes into a labeled factor.
# Codes "1" < "2" sort first, so labels must be supplied in that order.
parse_coded <- function(codes, labels) {
factor(strsplit(codes, " ; ")[[1]], labels = labels)
}
dsSex <- parse_coded(discsmallSex, c("female", "male"))
dbSex <- parse_coded(discbigSex, c("female", "male"))
dsPromote <- parse_coded(discsmallPromote, c("not_promoted", "promoted"))
dbPromote <- parse_coded(discbigPromote, c("not_promoted", "promoted"))
disc_small <- data.frame(sex=dsSex, promote=dsPromote)
disc_big <- data.frame(sex=dbSex, promote=dbPromote)
# Tabulate the small and big data frames
# Same promotion-rate gap as the original study, at 1/3 and 10x the sample size
disc_small %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 3 5
## male 1 7
disc_big %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 100 140
## male 30 210
# Create 1000 permutations for each
disc_small_perm <- disc_small %>%
oilabs_rep_sample_n(size = nrow(disc_small), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Create 1000 permutations for each
disc_big_perm <- disc_big %>%
oilabs_rep_sample_n(size = nrow(disc_big), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Plot the distributions of permuted differences
# The big dataset's null distribution is much narrower than the small one's
ggplot(disc_small_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
ggplot(disc_big_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Recall the quantiles associated with the original dataset
# Spell out quantile()'s `probs` argument rather than relying on partial
# matching of the abbreviation `p`.
disc_perm %>%
summarize(q.90 = quantile(diff_perm, probs = 0.90),
q.95 = quantile(diff_perm, probs = 0.95),
q.99 = quantile(diff_perm, probs = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.1333333 0.2083333 0.2916667
# Calculate the quantiles associated with the small dataset
disc_small_perm %>%
summarize(q.90 = quantile(diff_perm, probs = 0.90),
q.95 = quantile(diff_perm, probs = 0.95),
q.99 = quantile(diff_perm, probs = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.25 0.25 0.5
# Calculate the quantiles associated with the big dataset
# Larger n -> smaller critical values (tighter null distribution)
disc_big_perm %>%
summarize(q.90 = quantile(diff_perm, probs = 0.90),
q.95 = quantile(diff_perm, probs = 0.95),
q.99 = quantile(diff_perm, probs = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.05833333 0.06666667 0.09166667
# Calculate the p-value for the original dataset
# One-sided p-value: share of permuted differences at least as large as the
# observed male - female difference
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.023
# Calculate the p-value for the small dataset
disc_small_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.291
# Calculate the p-value for the big dataset
# Same observed gap, but with n = 480 the permutation p-value drops to ~0
disc_big_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0
# A second (hypothetical) dataset with the same 35/13 promotion split but a
# much more balanced assignment of promotions across the sexes.
dnPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
dnSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female"
disc_new <- data.frame(promote=factor(strsplit(dnPromote, " ; ")[[1]],
levels=c("not_promoted", "promoted")
),
sex=factor(strsplit(dnSex, " ; ")[[1]])
)
# Create 1000 permutations for each
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
disc_new_perm <- disc_new %>%
oilabs_rep_sample_n(size = nrow(disc_new), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Recall the original data
disc %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 10 14
## male 3 21
# Tabulate the new data
disc_new %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 7 17
## male 6 18
# Plot the distribution of the original permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig), col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot the distribution of the new permuted differences
ggplot(disc_new_perm, aes(x = diff_perm)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig), col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the p-value from the original data
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.025
# Find the p-value from the new data
# The balanced table yields a large p-value: no evidence of an association
disc_new_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.54
Chapter 3 - Hypothesis Testing Errors
Opportunity cost - do reminders about saving money encourage students to purchase fewer DVDs? (Frederick et al study):
Errors and their consequences - consequences of various conclusions and associated errors:
Example code includes:
# Opportunity-cost study: 150 students (75 control, 75 treatment) decide
# whether to buy a DVD; strings align row-by-row (97 buyDVD, 53 nobuyDVD).
oppDec <- "buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD"
oppGroup <- "control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment"
opportunity <- data.frame(decision=factor(strsplit(oppDec, " ; ")[[1]]),
group=factor(strsplit(oppGroup, " ; ")[[1]])
)
# Tabulate the data
opportunity %>%
select(decision, group) %>%
table()
## group
## decision control treatment
## buyDVD 56 41
## nobuyDVD 19 34
# Find the proportion who bought the DVD in each group
opportunity %>%
group_by(group) %>%
summarize(buy_prop = mean(decision == "buyDVD"))
## # A tibble: 2 × 2
## group buy_prop
## <fctr> <dbl>
## 1 control 0.7466667
## 2 treatment 0.5466667
# Create a barplot
ggplot(opportunity, aes(x = group, fill = decision)) +
geom_bar(position="fill")
# Data frame of differences in purchase rates after permuting
opp_perm <- opportunity %>%
oilabs_rep_sample_n(size = nrow(opportunity), reps = 1000) %>%
mutate(dec_perm = sample(decision)) %>%
group_by(replicate, group) %>%
summarize(prop_buy_perm = mean(dec_perm == "buyDVD"),
prop_buy = mean(decision == "buyDVD")) %>%
summarize(diff_perm = diff(prop_buy_perm),
diff_orig = diff(prop_buy)) # treatment - control
# Histogram of permuted differences
ggplot(opp_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = .005) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Calculate the p-value
# One-sided: the treatment (savings reminder) is expected to REDUCE buying,
# so "as extreme" means diff_perm at least as negative as diff_orig
opp_perm %>%
summarize(mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.01
# Calculate the two-sided p-value
# Doubling the one-sided tail (assumes a roughly symmetric null distribution)
opp_perm %>%
summarize(2*mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `2 * mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.02
Chapter 4 - Confidence Intervals
Parameters and confidence intervals - research questions can be comparative (hypothesis test) or estimation (confidence intervals):
Bootstrapping:
Variability in p-hat - how far are the sample data from the parameter?
Interpreting CI and technical conditions:
Example code includes:
# Do not have this dataset (30000 x 2 - poll-vote) - 30 votes in each of 1000 samples
# Reconstruct it from its summary: voteSum[i] is the number of "yes" (1) votes
# in a 30-vote poll, voteN[i] is how many of the 1000 polls had that yes-count.
voteSum <- c(9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)
voteN <- c(1, 7, 10, 27, 42, 90, 101, 143, 151, 136, 129, 79, 43, 25, 13, 3)
# Build all 30000 votes at once instead of growing a vector with c() inside a
# loop, which re-copies the whole accumulator on every iteration.
voteAll <- unlist(lapply(seq_along(voteSum), function(i) {
ballot <- rep(0L, 30)
ballot[seq_len(voteSum[i])] <- 1L
rep(ballot, times = voteN[i])
}))
# Random permutation of poll ids 1..1000 (used to label the 1000 polls)
voteNum <- sample(1:1000, 1000, replace=FALSE)
# Needs to be a tibble since oilabs_rep_sample_n() has an implied drop=TRUE for data frames
# Each poll id repeats 30x (one row per vote); arrange(poll) keeps each poll's
# votes contiguous
all_polls <- tibble::as_tibble(data.frame(poll=rep(voteNum, each=30),
vote=voteAll
) %>% arrange(poll)
)
# Select one poll from which to resample: one_poll
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Generate 1000 resamples of one_poll: one_poll_boot_30
# Bootstrap: resample WITH replacement at the original sample size (30)
one_poll_boot_30 <- one_poll %>%
oilabs_rep_sample_n(size = nrow(one_poll), replace = TRUE, reps = 1000)
# Compute p-hat for each poll: ex1_props
ex1_props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex2_props
ex2_props <- one_poll_boot_30 %>%
group_by(replicate) %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat and p-hat*
# The bootstrap SE (~0.089) closely tracks the across-polls SE (~0.087)
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08874373
# Resample from one_poll with n = 3: one_poll_boot_3
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(3, replace = TRUE, reps = 1000)
# Resample from one_poll with n = 300: one_poll_boot_300
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(300, replace = TRUE, reps = 1000)
# Compute p-hat* for each resampled poll: ex3_props
# summarize() operates per replicate (grouping comes from the resampler)
ex3_props <- one_poll_boot_3 %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex4_props
ex4_props <- one_poll_boot_300 %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat* for n = 3 vs. n = 300
# SE shrinks with resample size: 0.296 at n = 3 vs. 0.028 at n = 300
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2964122
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02826066
# Recall the variability of sample proportions
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08874373
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2964122
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02826066
# Create smoothed density curves for all four experiments
ggplot() +
geom_density(data = ex1_props, aes(x = prop_yes), col = "black", bw = .1) +
geom_density(data = ex2_props, aes(x = prop_yes), col = "green", bw = .1) +
geom_density(data = ex3_props, aes(x = prop_yes), col = "red", bw = .1) +
geom_density(data = ex4_props, aes(x = prop_yes), col = "blue", bw = .1)
# Compute proportion of votes for Candidate X: props
props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Proportion of polls within 2SE
# Empirical check of the "95% within 2 SD" rule: 96.6% of polls fall inside
props %>%
mutate(lower = mean(prop_yes) - 2 * sd(prop_yes),
upper = mean(prop_yes) + 2 * sd(prop_yes),
in_CI = prop_yes > lower & prop_yes < upper) %>%
summarize(mean(in_CI))
## # A tibble: 1 × 1
## `mean(in_CI)`
## <dbl>
## 1 0.966
# Again, set the one sample that was collected
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Compute p-hat from one_poll: p_hat
p_hat <- mean(one_poll$vote)
# Bootstrap to find the SE of p-hat: one_poll_boot
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(30, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Create an interval of plausible values
# Empirical rule: p-hat +/- 2 bootstrap SEs approximates a 95% CI
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Find the 2.5% and 97.5% of the p-hat values
# Spell out quantile()'s `probs` argument rather than relying on partial
# matching of the abbreviation `p`.
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, probs = 0.025),
q975_prop = quantile(prop_yes_boot, probs = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.3333333 0.6666667
# Bootstrap t-confidence interval for comparison
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Recall the bootstrap t-confidence interval
p_hat <- mean(one_poll$vote)
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Collect a sample of 30 observations from the population
# as.tbl() is deprecated in dplyr; use tibble::as_tibble() (same behavior)
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, 1, .6)))
# NOTE(review): p_hat is NOT recomputed for this new sample below -- it still
# holds the value from the previous poll. Confirm this is intended.
# Resample the data using samples of size 300 (an incorrect strategy!)
# Resamples must match the original sample size; n = 300 understates the true
# sampling variability, so the interval below is far too narrow.
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(size=300, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_300 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.4497326 0.5502674
# Resample the data using samples of size 3 (an incorrect strategy!)
# n = 3 overstates the variability, so this interval is far too wide
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(size=3, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_3 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 -0.01919018 1.01919
# Collect 30 observations from a population with true proportion of 0.8
# as.tbl() is deprecated in dplyr; use tibble::as_tibble() (same behavior)
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, size = 1, prob = 0.8)))
# Compute p-hat of new sample: p_hat
p_hat <- mean(one_poll$vote)
# Resample the 30 observations (with replacement)
# Correct strategy: resample size equals the original sample size
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(size=nrow(one_poll), replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Calculate the bootstrap t-confidence interval
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.6534714 0.9465286
# Calculate a 95% bootstrap percentile interval
# Spell out quantile()'s `probs` argument rather than relying on partial
# matching of the abbreviation `p`.
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, probs = 0.025),
q975_prop = quantile(prop_yes_boot, probs = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.6658333 0.9333333
# Calculate a 99% bootstrap percentile interval
one_poll_boot %>%
summarize(q005_prop = quantile(prop_yes_boot, probs = 0.005),
q995_prop = quantile(prop_yes_boot, probs = 0.995))
## # A tibble: 1 × 2
## q005_prop q995_prop
## <dbl> <dbl>
## 1 0.5666667 0.9666667
# Calculate a 90% bootstrap percentile interval
one_poll_boot %>%
summarize(q05_prop = quantile(prop_yes_boot, probs = 0.05),
q95_prop = quantile(prop_yes_boot, probs = 0.95))
## # A tibble: 1 × 2
## q05_prop q95_prop
## <dbl> <dbl>
## 1 0.6666667 0.9
Chapter 1 - Correlation and Regression
Modeling bivariate relationships - relationships between two variables:
Characterizing bivariate relationships:
Outliers - points that do not fit with the rest of the data:
Example code includes:
data(ncbirths, package="openintro")
# Scatterplot of weight vs. weeks
ggplot(ncbirths, aes(x=weeks, y=weight)) +
geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).
# Boxplot of weight vs. weeks
# cut() discretizes weeks into 5 equal-width bins so boxplots can be drawn
ggplot(data = ncbirths,
aes(x = cut(weeks, breaks = 5), y = weight)) +
geom_boxplot()
# Mammals scatterplot
data(mammals, package="openintro")
ggplot(mammals, aes(x=BodyWt, y=BrainWt)) +
geom_point()
# Baseball player scatterplot
data(mlbBat10, package="openintro")
ggplot(mlbBat10, aes(x=OBP, y=SLG)) +
geom_point()
# Body dimensions scatterplot
data(bdims, package="openintro")
ggplot(bdims, aes(x=hgt, y=wgt, color=factor(sex))) +
geom_point()
# Smoking scatterplot
data(smoking, package="openintro")
ggplot(smoking, aes(x=age, y=amtWeekdays)) +
geom_point()
## Warning: Removed 1270 rows containing missing values (geom_point).
# Scatterplot with coord_trans()
# coord_trans() transforms the coordinate system after statistics are computed
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# Scatterplot with scale_x_log10() and scale_y_log10()
# scale_*_log10() transforms the data before statistics are computed
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
scale_x_log10() + scale_y_log10()
# Scatterplot of SLG vs. OBP
mlbBat10 %>%
filter(AB >= 200) %>%
ggplot(aes(x = OBP, y = SLG)) +
geom_point()
# Identify the outlying player
mlbBat10 %>%
filter(AB >= 200, OBP < 0.2)
## name team position G AB R H 2B 3B HR RBI TB BB SO SB CS OBP
## 1 B Wood LAA 3B 81 226 20 33 2 0 4 14 47 6 71 1 0 0.174
## SLG AVG
## 1 0.208 0.146
Chapter 2 - Correlation
Quantifying strength of bivariate relationship - correlation:
Anscombe dataset - synthetic datasets of the problems with correlation (and regression):
Interpretation of correlation - correlation is not causality:
Spurious correlation:
Example code includes:
data(ncbirths, package="openintro")
# Compute correlation
ncbirths %>%
summarize(N = n(), r = cor(weight, mage))
## N r
## 1 1000 0.05506589
# Compute correlation for all non-missing pairs
# use = "pairwise.complete.obs" drops NA pairs instead of returning NA
ncbirths %>%
summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## N r
## 1 1000 0.6701013
data(anscombe)
# Reshape the built-in wide anscombe data (x1..x4, y1..y4) into long form:
# one (x, y) pair per row, with a point id 1..11 and a set id 1..4
Anscombe <- data.frame(x=as.vector(as.matrix(anscombe[,1:4])),
y=as.vector(as.matrix(anscombe[,5:8])),
id=rep(1:11, times=4),
set=rep(1:4, each=11)
)
ggplot(data = Anscombe, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ set)
# Compute properties of Anscombe
# All four sets share nearly identical means, SDs, and correlations despite
# very different shapes -- the point of Anscombe's quartet
Anscombe %>%
group_by(set) %>%
summarize(N = n(), mean(x), sd(x), mean(y), sd(y), cor(x, y))
## # A tibble: 4 × 7
## set N `mean(x)` `sd(x)` `mean(y)` `sd(y)` `cor(x, y)`
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.316625 7.500909 2.031568 0.8164205
## 2 2 11 9 3.316625 7.500909 2.031657 0.8162365
## 3 3 11 9 3.316625 7.500000 2.030424 0.8162867
## 4 4 11 9 3.316625 7.500909 2.030579 0.8165214
data(mlbBat10, package="openintro")
data(mammals, package="openintro")
data(bdims, package="openintro")
# Correlation for all baseball players
mlbBat10 %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 1199 0.8145628
# Correlation for all players with at least 200 ABs
# Filtering out small-sample players weakens the correlation, showing how a
# few extreme low-AB cases inflate r.
mlbBat10 %>%
filter(AB >= 200) %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 329 0.6855364
# Correlation of body dimensions
# group_by() computes r separately within each sex group.
bdims %>%
group_by(sex) %>%
summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 × 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.4310593
## 2 1 247 0.5347418
# Correlation among mammals, with and without log
# Log-transforming both variables strengthens the linear association for
# heavily right-skewed data.
mammals %>%
summarize(N = n(),
r = cor(BodyWt, BrainWt),
r_log = cor(log(BodyWt), log(BrainWt)))
## N r r_log
## 1 62 0.9341638 0.9595748
# Create a random noise dataset: 20 groups (z) of 50 independent (x, y) pairs.
# NOTE(review): no set.seed() here, so the specific correlations reported
# below are not reproducible across runs.
noise <- data.frame(x=rnorm(1000), y=rnorm(1000), z=rep(1:20, each=50))
# Create faceted scatterplot
noise %>%
ggplot(aes(x=x, y=y)) +
geom_point() +
facet_wrap(~ z)
# Compute correlations for each dataset
noise_summary <- noise %>%
group_by(z) %>%
summarize(N = n(), spurious_cor = cor(x, y))
# Isolate sets with correlations above 0.2 in absolute strength
# Even pure noise yields some moderately large sample correlations.
noise_summary %>%
filter(abs(spurious_cor) > 0.2)
## # A tibble: 2 × 3
## z N spurious_cor
## <int> <int> <dbl>
## 1 17 50 -0.2418963
## 2 18 50 -0.2696328
Chapter 3 - Simple Linear Regression
Visualization of linear models - adjusting the intercept and the slope to best fit the data:
Understanding the linear model: Response = f(Explanatory) + Noise:
Regression vs. regression to the mean (Galton):
Example code includes:
# Scatterplot with regression line
# geom_smooth(method = "lm", se = FALSE) overlays the least-squares fit
# without the confidence band.
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
bdims_summary <- bdims %>%
summarize(N=n(), r=cor(hgt, wgt),
mean_hgt=mean(hgt), sd_hgt=sd(hgt),
mean_wgt=mean(wgt), sd_wgt=sd(wgt)
)
# Print bdims_summary
bdims_summary
## N r mean_hgt sd_hgt mean_wgt sd_wgt
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576
# Add slope and intercept
# Closed-form simple regression: slope = r * sd_y / sd_x, and the line
# passes through (mean_x, mean_y), giving intercept = mean_y - slope * mean_x.
bdims_summary %>%
mutate(slope = r * sd_wgt / sd_hgt,
intercept = mean_wgt - slope*mean_hgt
)
## N r mean_hgt sd_hgt mean_wgt sd_wgt slope intercept
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576 1.017617 -105.0113
data(GaltonFamilies, package="HistData")
# Rename columns to match the course's Galton data and keep only what's needed.
GaltonUse <- GaltonFamilies %>%
mutate(sex=gender, height=childHeight) %>%
select(family, father, mother, sex, height)
# Self-join to attach the number of children per family to each child's row.
GaltonUse <- GaltonUse %>%
left_join(GaltonUse %>% group_by(family) %>% summarize(nkids=n()), by="family")
Galton_women <- GaltonUse %>%
filter(sex=="female")
Galton_men <- GaltonUse %>%
filter(sex=="male")
# Height of children vs. height of father
# The identity line (slope 1) vs. the fitted line illustrates regression to
# the mean: the fitted slope is less than 1.
ggplot(data = Galton_men, aes(x = father, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Height of children vs. height of mother
ggplot(data = Galton_women, aes(x = mother, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
Chapter 4 - Interpreting Regression Models
Interpretation of regression coefficients - UCLA textbook pricing (dataset ‘textbooks’):
Linear model object interpretation:
Using the linear model - residuals can give information about biggest outliers (often interesting):
Example code includes:
# Linear model for weight as a function of height
# Formula syntax: response ~ explanatory. Printing the unassigned fit shows
# only the call and coefficients (full detail comes from summary()).
lm(wgt ~ hgt, data = bdims)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data=mlbBat10)
##
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
##
## Coefficients:
## (Intercept) OBP
## 0.009407 1.110323
# Log-linear model for body weight as a function of brain weight
# Transformations can be applied directly inside the formula.
lm(log(BodyWt) ~ log(BrainWt), data=mammals)
##
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Coefficients:
## (Intercept) log(BrainWt)
## -2.509 1.225
# Store the fitted model object for use with extractor functions.
mod <- lm(wgt ~ hgt, data = bdims)
# Show the coefficients
coef(mod)
## (Intercept) hgt
## -105.011254 1.017617
# Show the full output
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Mean of weights equal to mean of fitted values?
# OLS guarantees mean(fitted) == mean(y); it holds exactly here, but exact
# float comparison with == is fragile in general -- all.equal() is safer.
mean(bdims$wgt) == mean(fitted.values(mod))
## [1] TRUE
# Mean of the residuals
# Another OLS property: residuals sum to (numerically) zero.
mean(residuals(mod))
## [1] -1.266971e-15
# Create bdims_tidy
# broom::augment() returns the data plus per-case diagnostics
# (.fitted, .resid, .hat, .cooksd, ...).
bdims_tidy <- broom::augment(mod)
# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...
# A one-row data frame of new inputs; newdata column names must match the
# model's explanatory variables.
ben <- data.frame(wgt=74.8, hgt=182.8)
# Print ben
ben
## wgt hgt
## 1 74.8 182.8
# Predict the weight of ben
predict(mod, newdata=ben)
## 1
## 81.00909
# Add the line to the scatterplot
# Trick: t(coef(mod)) gives a one-row data frame with columns "(Intercept)"
# and "hgt", which geom_abline() maps to intercept and slope aesthetics.
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = as.data.frame(t(coef(mod))),
aes(intercept = `(Intercept)`, slope = hgt),
color = "dodgerblue")
Chapter 5 - Model Fit
Assessing model fit - how well does the regression line fit the underlying data?
Comparing model fits:
Unusual points - leverage and influence:
Dealing with unusual points - managing the impacts of leverage and influence:
Example code includes:
# Refit the height/weight model for the model-fit chapter.
mod <- lm(wgt ~ hgt, data = bdims)
# View summary of model
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Compute RMSE
# Dividing by df.residual(mod) (n - number of coefficients) rather than n
# reproduces the "Residual standard error" reported by summary().
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804
# View model summary
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
bdims_tidy <- broom::augment(mod)
# Compute R-squared
# R^2 = 1 - var(residuals) / var(response): the fraction of response
# variance explained by the model (matches summary()'s Multiple R-squared).
bdims_tidy %>%
summarize(var_y = var(wgt), var_e = var(.resid)) %>%
mutate(R_squared = 1 - var_e/var_y)
## var_y var_e R_squared
## 1 178.1094 86.46839 0.5145208
# Fit on players with at least 10 at-bats (keeps some extreme cases in).
mod <- lm(SLG ~ OBP, data=filter(mlbBat10, AB >= 10))
# Rank points of high leverage
# .hat is the leverage statistic: distance of the case's OBP from the mean.
mod %>%
broom::augment() %>%
arrange(desc(.hat)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
# Rank influential points
# .cooksd (Cook's distance) combines leverage and residual size: how much the
# fit changes if the case is removed.
mod %>%
broom::augment() %>%
arrange(desc(.cooksd)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.308 0.550 0.69049108 0.009158810 -0.3824911 0.016410487 0.07011360
## 2 0.833 0.385 0.47211002 0.004190644 0.3608900 0.003435619 0.07028875
## 3 0.800 0.455 0.56475653 0.006186785 0.2352435 0.007488132 0.07101125
## 4 0.379 0.133 0.13858258 0.005792344 0.2404174 0.006563752 0.07098798
## 5 0.786 0.438 0.54225666 0.005678026 0.2437433 0.006307223 0.07097257
## 6 0.231 0.077 0.06446537 0.007506974 0.1665346 0.011024863 0.07127661
## .cooksd .std.resid
## 1 0.24274468 -5.394312
## 2 0.04407145 5.056428
## 3 0.04114818 3.302718
## 4 0.03760256 3.373787
## 5 0.03712042 3.420018
## 6 0.03057912 2.342252
# Create nontrivial_players
# Remove the extreme OBP >= 0.5 cases (small-sample flukes) before refitting.
nontrivial_players <- filter(mlbBat10, AB >= 10 & OBP < 0.5)
# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data=nontrivial_players)
# View model summary
summary(mod_cleaner)
##
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31383 -0.04165 -0.00261 0.03992 0.35819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.043326 0.009823 -4.411 1.18e-05 ***
## OBP 1.345816 0.033012 40.768 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6932
## F-statistic: 1662 on 1 and 734 DF, p-value: < 2.2e-16
# Visualize new model
# Same t(coef(...)) trick as before to map the fitted coefficients onto
# geom_abline()'s intercept/slope aesthetics.
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_abline(data = as.data.frame(t(coef(mod_cleaner))),
aes(intercept = `(Intercept)`, slope = OBP),
color = "dodgerblue")
# Visualize new model
# Equivalent line via geom_smooth (also drawn with a confidence band).
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_smooth(method="lm")
# Rank high leverage points
# Secondary sort on .cooksd breaks ties among equal-leverage cases.
mod %>%
broom::augment() %>%
arrange(desc(.hat), .cooksd) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
Chapter 1 - What is statistical modeling?
Statistical models are summaries of data (can be encapsulations, machine learning, etc.):
R objects for statistical modeling - functions, formulae, and data frames:
Example code includes:
# Simulated test score as a function of school type and motivation levels.
#
# school: "private" or "public". A private school subtracts up to 5 points;
#   the penalty shrinks as relig_motivation approaches 1 (and reverses past 1).
# acad_motivation: each unit multiplies the score by 1.15 (compounding),
#   so its effect is multiplicative, not additive.
# relig_motivation: offsets the private-school penalty; no effect for
#   public schools (the school == "private" comparison yields 0).
#
# Returns a numeric score; baseline is 100 for a public school at zero
# motivation. Fully vectorized over its arguments.
test_scores <- function(school = "private", acad_motivation = 0, relig_motivation = 0) {
  (100 - 5 * (1 - relig_motivation) * (school == "private")) * 1.15^acad_motivation
}
# Explore the simulation one input at a time, holding the others at baseline.
# Baseline run
test_scores(school = "public", acad_motivation = 0, relig_motivation = 0)
## [1] 100
# Change school input, leaving others at baseline
test_scores(school = "private", acad_motivation = 0, relig_motivation = 0)
## [1] 95
# Change acad_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 1, relig_motivation = 0)
## [1] 115
# Change relig_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 0, relig_motivation = 1)
## [1] 100
# Use results above to estimate output for new inputs
# Additive extrapolation from the one-at-a-time effects above.
my_prediction <- 100 - 5 + (2 * 0) + (2 * 15)
my_prediction
## [1] 125
# Check prediction by using test_scores() directly
# The additive estimate (125) underestimates the true value (138.8625):
# the inputs interact and acad_motivation acts multiplicatively (1.15^x).
test_scores(school = "private", acad_motivation = 2, relig_motivation = 2)
## [1] 138.8625
# Basic ways to inspect a data frame: nrow(), names(), head().
# Use data() to load Trucking_jobs
data(Trucking_jobs, package="statisticalModeling")
# View the number of rows in Trucking_jobs
nrow(Trucking_jobs)
## [1] 129
# Use names() to find variable names in mosaicData::Riders
names(mosaicData::Riders)
## [1] "date" "day" "highT" "lowT" "hi" "lo" "precip"
## [8] "clouds" "riders" "ct" "weekday" "wday"
# Look at the head() of diamonds
head(ggplot2::diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# NOTE(review): mean_ is aliased here but mosaic::mean is what's actually
# called below -- presumably the alias was intended for the formula-interface
# call; confirm against the mosaic package version in use.
mean_ <- mosaic::mean_
data(AARP, package="statisticalModeling")
# Find the variable names in AARP
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Find the mean cost broken down by sex
# mosaic's formula interface: statistic of Cost split by groups of Sex.
mosaic::mean(Cost ~ Sex, data = AARP)
## F M
## 47.29778 57.53056
# Create a boxplot using base, lattice, or ggplot2
# Base graphics also accept the same y ~ group formula syntax.
boxplot(Cost ~ Sex, data=AARP)
# Make a scatterplot using base, lattice, or ggplot2
plot(Cost ~ Age, data=AARP)
Chapter 2 - Designing and Training Models
Modeling is a process rather than a result:
Evaluating models means assessing how well they match the real world (underlying data):
Example code includes:
data(Runners, package="statisticalModeling")
# Find the variable names in Runners
names(Runners)
## [1] "age" "net" "gun" "sex"
## [5] "year" "previous" "nruns" "start_position"
# Build models: handicap_model_1, handicap_model_2, handicap_model_3
# Three linear models of net running time with increasing numbers of inputs.
handicap_model_1 <- lm(net ~ age, data = Runners)
handicap_model_2 <- lm(net ~ sex, data = Runners)
handicap_model_3 <- lm(net ~ age + sex, data = Runners)
# For now, here's a way to visualize the models
statisticalModeling::fmodel(handicap_model_1)
statisticalModeling::fmodel(handicap_model_2)
statisticalModeling::fmodel(handicap_model_3)
# Build rpart model: model_2
# cp is rpart's complexity parameter: smaller cp allows a larger tree.
model_2 <- rpart::rpart(net ~ age + sex, data=Runners, cp=0.002)
# Examine graph of model_2 (don't change)
statisticalModeling::fmodel(model_2, ~ age + sex)
# DO NOT HAVE THIS DATASET!
# Create run_again_model
# run_again_model <- rpart(runs_again ~ age + sex + net, data=Ran_twice, cp=0.005)
# Visualize the model (don't change)
# fmodel(run_again_model, ~ age + net, data = Ran_twice)
data(AARP, package="statisticalModeling")
# Display the variable names in the AARP data frame
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
# Construct a data frame: example_vals
example_vals <- data.frame(Age=60, Sex="F", Coverage=200)
# Predict insurance cost using predict()
predict(insurance_cost_model, newdata=example_vals)
## 1
## 363.637
# Calculate model output using evaluate_model()
# evaluate_model() gives the same prediction as predict() but returns it
# alongside the inputs in a data frame.
statisticalModeling::evaluate_model(insurance_cost_model, data=example_vals)
## Age Sex Coverage model_output
## 1 60 F 200 363.637
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
# Create a data frame: new_inputs_1
# data.frame() pairs values element-wise -> 2 rows of inputs.
new_inputs_1 <- data.frame(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use expand.grid(): new_inputs_2
# expand.grid() builds every combination of the values -> 2*2*2 = 8 rows.
new_inputs_2 <- expand.grid(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use predict() for new_inputs_1 and new_inputs_2
predict(insurance_cost_model, newdata = new_inputs_1)
## 1 2
## -99.98726 292.88435
predict(insurance_cost_model, newdata = new_inputs_2)
## 1 2 3 4 5 6 7
## -99.98726 101.11503 -89.75448 111.34781 81.54928 282.65157 91.78206
## 8
## 292.88435
# Use evaluate_model() for new_inputs_1 and new_inputs_2
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_1)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 M 100 292.88435
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_2)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 F 0 101.11503
## 3 30 M 0 -89.75448
## 4 90 M 0 111.34781
## 5 30 F 100 81.54928
## 6 90 F 100 282.65157
## 7 30 M 100 91.78206
## 8 90 M 100 292.88435
# Evaluate insurance_cost_model
# With no data argument, evaluate_model() chooses representative input
# levels on its own (as shown in the output below).
statisticalModeling::evaluate_model(insurance_cost_model)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
# Use fmodel() to reproduce the graphic
# The one-sided formula orders the roles: first term on the x-axis, later
# terms shown as separate curves/facets.
statisticalModeling::fmodel(insurance_cost_model, ~ Coverage + Age + Sex)
# A new formula to highlight difference in sexes
new_formula <- ~ Coverage + Sex + Age
# Make the new plot (don't change)
statisticalModeling::fmodel(insurance_cost_model, new_formula)
Chapter 3 - Assessing Prediction Performance
Choosing explanatory variables - depends on the intended purpose for the statistical model:
Cross-validation - divide the data into two non-overlapping datasets, train and test:
Example code includes:
runIDs <- c( 5035 , 10 , 9271 , 256 , 1175 , 17334 , 1571 , 5264 , 15985 , 2237 , 3178 , 7999 , 16462 , 15443 , 13318 , 10409 , 8741 , 5998 , 2860 , 8710 , 3695 , 12340 , 6598 , 6354 , 1125 , 8759 , 7238 , 294 , 2268 , 7219 , 9154 , 5940 , 7464 , 3669 , 14729 , 11636 , 5018 , 1877 , 4639 , 1049 , 4484 , 3896 , 8944 , 11838 , 5960 , 15648 , 11552 , 250 , 9584 , 15110 , 9106 , 10824 , 7706 , 5653 , 4018 , 8028 , 7468 , 14766 , 2945 , 10805 , 2439 , 13616 , 3151 , 10493 , 13595 , 3308 , 1038 , 9019 , 3477 , 11211 , 12410 , 7697 , 7709 , 3699 , 16979 , 9688 , 4891 , 6010 , 6582 , 3983 , 920 , 8972 , 9185 , 4265 , 14708 , 7575 , 3459 , 11727 , 14696 , 4075 , 6604 , 13815 , 260 , 8606 , 14643 , 4323 , 13826 , 3487 , 10602 , 4029 )
runAge <- c( 54 , 27 , 24 , 39 , 52 , 28 , 33 , 40 , 32 , 33 , 30 , 58 , 33 , 46 , 34 , 35 , 50 , 60 , 30 , 28 , 30 , 29 , 56 , 43 , 62 , 60 , 37 , 48 , 27 , 32 , 53 , 43 , 41 , 33 , 29 , 49 , 29 , 24 , 45 , 34 , 56 , 51 , 41 , 38 , 33 , 29 , 34 , 31 , 35 , 43 , 29 , 30 , 30 , 33 , 33 , 46 , 45 , 51 , 32 , 44 , 37 , 46 , 28 , 31 , 51 , 40 , 44 , 28 , 48 , 28 , 44 , 58 , 27 , 33 , 42 , 45 , 36 , 37 , 26 , 47 , 39 , 38 , 36 , 66 , 50 , 31 , 34 , 26 , 53 , 44 , 45 , 24 , 33 , 34 , 50 , 31 , 54 , 38 , 31 , 40 )
runNet <- c( 90 , 74.22 , 90.85 , 91.7 , 94.13 , 99.13 , 78.98 , 102.6 , 111.6 , 100.9 , 81.37 , 82.63 , 83.32 , 71.17 , 73.62 , 79.32 , 111.5 , 86.62 , 111.3 , 69.7 , 66.5 , 65.52 , 99.38 , 89.52 , 76.23 , 79.2 , 59.88 , 124.5 , 107.5 , 105.5 , 78.1 , 99.22 , 96.68 , 59.25 , 94.75 , 93.45 , 76.15 , 91.53 , 75.07 , 80.9 , 94.18 , 97.57 , 86.73 , 92.77 , 99.67 , 85.38 , 65.97 , 77.38 , 94.42 , 78.92 , 87.03 , 97.78 , 86.82 , 113.1 , 88.58 , 74.05 , 88.52 , 83.73 , 81.4 , 69 , 78.43 , 101.2 , 81.2 , 84.45 , 105.1 , 70.38 , 83.28 , 106.5 , 79.12 , 69.83 , 73.35 , 66.07 , 86.23 , 76.72 , 91.88 , 79.12 , 81.63 , 79.67 , 86.62 , 71.63 , 99.28 , 90.58 , 101.2 , 95.8 , 77.58 , 102.4 , 79.67 , 111.2 , 76.88 , 104.4 , 117.4 , 86.68 , 94.78 , 86.1 , 79.63 , 79.23 , 94.97 , 85.67 , 97.07 , 83.15 )
runGun <- c( 90.28 , 75.08 , 93.55 , 95.18 , 99.4 , 105.6 , 81.5 , 107.8 , 116.6 , 104.6 , 82.18 , 82.95 , 84.32 , 71.32 , 74.68 , 80.52 , 114.8 , 87.05 , 115.6 , 70.17 , 66.75 , 66.07 , 105.2 , 95.63 , 81.27 , 80.13 , 60.02 , 125.1 , 107.5 , 110 , 78.53 , 109.6 , 102.5 , 59.43 , 101.1 , 100.3 , 76.47 , 96.98 , 76.43 , 82.45 , 97.8 , 103.6 , 89.53 , 93.63 , 104.5 , 89.73 , 66.25 , 78.62 , 99.47 , 79.15 , 91.13 , 105.4 , 89.85 , 117.8 , 89.45 , 74.93 , 89.2 , 87.32 , 87.9 , 69.13 , 79.97 , 111 , 84.5 , 85.55 , 110.5 , 74.15 , 83.58 , 114.7 , 79.62 , 70.42 , 73.85 , 66.3 , 92.37 , 77.53 , 98.77 , 79.65 , 85.17 , 85.67 , 92.68 , 72.15 , 107.6 , 96.18 , 103.4 , 99.55 , 78.85 , 107 , 81.42 , 114.4 , 77.85 , 108.5 , 121.7 , 92.68 , 96.87 , 88.08 , 80.43 , 79.93 , 99.3 , 90.47 , 102.3 , 84.75 )
runSex <- c( 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'F' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' )
runYear <- c( 2004 , 2001 , 2000 , 2004 , 2005 , 2003 , 2002 , 2001 , 2004 , 2005 , 2005 , 2005 , 2002 , 2004 , 2003 , 2005 , 2005 , 2002 , 2006 , 2006 , 2005 , 2003 , 2004 , 2003 , 2003 , 2003 , 2003 , 2006 , 2004 , 2002 , 2005 , 2006 , 2004 , 2005 , 2004 , 2002 , 2002 , 2004 , 2004 , 2002 , 2001 , 2004 , 2001 , 2002 , 2003 , 2005 , 2004 , 2001 , 2005 , 2003 , 2004 , 2004 , 2003 , 2002 , 2005 , 2002 , 2000 , 2001 , 2005 , 2006 , 2004 , 2006 , 2000 , 2004 , 2002 , 2002 , 2004 , 2006 , 2004 , 2002 , 2005 , 2000 , 2005 , 2003 , 2004 , 2003 , 2005 , 2003 , 2005 , 2004 , 2005 , 2001 , 2000 , 2000 , 2001 , 2002 , 2005 , 2004 , 2006 , 2001 , 2005 , 2005 , 2003 , 2001 , 2005 , 2000 , 2002 , 2004 , 2004 , 2006 )
runPrevious <- c( 5 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 4 , 5 , 0 , 5 , 1 , 0 , 3 , 3 , 0 , 2 , 1 , 0 , 1 , 1 , 4 , 1 , 0 , 4 , 2 , 1 , 4 , 1 , 1 , 4 , 1 , 1 , 1 , 1 , 0 , 2 , 2 , 1 , 1 , 1 , 0 , 2 , 2 , 2 , 2 , 1 , 2 , 1 , 0 , 1 , 1 , 0 , 1 , 0 , 3 , 1 , 1 , 1 , 1 , 3 , 2 , 1 , 5 , 1 , 5 , 0 , 6 , 1 , 1 , 2 , 2 , 1 , 3 , 0 , 0 , 1 , 0 , 1 , 1 , 1 , 2 , 1 , 1 , 1 , 0 , 1 , 3 , 1 , 0 , 1 , 0 , 1 , 0 , 3 , 1 , 4 )
runNRuns <- c( 9 , 8 , 4 , 3 , 4 , 5 , 4 , 6 , 3 , 4 , 6 , 6 , 4 , 8 , 4 , 3 , 7 , 8 , 3 , 4 , 3 , 4 , 6 , 4 , 5 , 3 , 3 , 5 , 4 , 4 , 6 , 4 , 5 , 6 , 4 , 4 , 3 , 3 , 5 , 8 , 7 , 5 , 8 , 3 , 3 , 4 , 5 , 5 , 3 , 5 , 3 , 4 , 4 , 3 , 3 , 3 , 4 , 3 , 5 , 4 , 4 , 4 , 5 , 6 , 5 , 3 , 10 , 4 , 9 , 5 , 7 , 3 , 4 , 5 , 4 , 4 , 6 , 5 , 4 , 3 , 3 , 3 , 9 , 6 , 3 , 3 , 3 , 4 , 3 , 7 , 4 , 3 , 5 , 6 , 3 , 4 , 3 , 4 , 3 , 6 )
runStart_Position <- c( 'eager' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'calm' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'calm' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'calm' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' )
# Reconstruct the course's 100-row sample of Runners from the literal
# vectors defined above; orig.id preserves each row's id in the full data.
Runners_100 <- data.frame(age=as.integer(runAge),
net=runNet,
gun=runGun,
sex=runSex,
year=as.integer(runYear),
previous=as.integer(runPrevious),
nruns=as.integer(runNRuns),
start_position=runStart_Position,
orig.id=as.integer(runIDs),
stringsAsFactors=FALSE
)
str(Runners_100)
## 'data.frame': 100 obs. of 9 variables:
## $ age : int 54 27 24 39 52 28 33 40 32 33 ...
## $ net : num 90 74.2 90.8 91.7 94.1 ...
## $ gun : num 90.3 75.1 93.5 95.2 99.4 ...
## $ sex : chr "F" "M" "F" "F" ...
## $ year : int 2004 2001 2000 2004 2005 2003 2002 2001 2004 2005 ...
## $ previous : int 5 1 0 1 1 1 1 1 2 2 ...
## $ nruns : int 9 8 4 3 4 5 4 6 3 4 ...
## $ start_position: chr "eager" "eager" "calm" "mellow" ...
## $ orig.id : int 5035 10 9271 256 1175 17334 1571 5264 15985 2237 ...
# Compare a base model against an augmented one on the SAME training data.
# Build a model of net running time
base_model <- lm(net ~ age + sex, data = Runners_100)
# Evaluate base_model on the training data
base_model_output <- predict(base_model, newdata = Runners_100)
# Build the augmented model
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Evaluate aug_model on the training data
aug_model_output <- predict(aug_model, newdata = Runners_100)
# How much do the model outputs differ?
# Mean squared difference between the two models' predictions.
mean((base_model_output - aug_model_output) ^ 2, na.rm = TRUE)
## [1] 0.5157921
# Build and evaluate the base model on Runners_100
base_model <- lm(net ~ age + sex, data = Runners_100)
base_model_output <- predict(base_model, newdata = Runners_100)
# Build and evaluate the augmented model on Runners_100
aug_model <- lm(net ~ age + sex + previous, data=Runners_100)
aug_model_output <- predict(aug_model, newdata = Runners_100)
# Find the case-by-case differences
base_model_differences <- with(Runners_100, net - base_model_output)
aug_model_differences <- with(Runners_100, net - aug_model_output)
# Calculate mean square errors
# In-sample MSE always drops (or stays equal) when a variable is added --
# which is why in-sample error alone can't justify the extra variable.
mean(base_model_differences ^ 2)
## [1] 131.5594
mean(aug_model_differences ^ 2)
## [1] 131.0436
data(CPS85, package="mosaicData")
# Add bogus column to CPS85 (don't change)
# A pure-noise TRUE/FALSE predictor (unseeded, so it varies run to run).
CPS85$bogus <- rnorm(nrow(CPS85)) > 0
# Make the base model
base_model <- lm(wage ~ educ + sector + sex, data = CPS85)
# Make the bogus augmented model
aug_model <- lm(wage ~ educ + sector + sex + bogus, data = CPS85)
# Find the MSE of the base model
mean((CPS85$wage - predict(base_model, newdata = CPS85)) ^ 2)
## [1] 19.73308
# Find the MSE of the augmented model
# Even pure noise lowers in-sample MSE -- the motivation for cross-validation.
mean((CPS85$wage - predict(aug_model, newdata = CPS85)) ^ 2)
## [1] 19.5078
# Generate a random TRUE or FALSE for each case in Runners_100
# rnorm(n) > 0 gives an (unseeded) roughly 50/50 train/test split.
Runners_100$training_cases <- rnorm(nrow(Runners_100)) > 0
# Build base model net ~ age + sex with training cases
base_model <-
lm(net ~ age + sex, data = subset(Runners_100, training_cases))
# Evaluate the model for the testing cases
# Note the ! -- predictions are made only on held-out cases.
Preds <-
statisticalModeling::evaluate_model(base_model, data = subset(Runners_100, !training_cases))
# Calculate the MSE on the testing data
with(data = Preds, mean((net - model_output)^2))
## [1] 157.0097
# The model
model <- lm(net ~ age + sex, data = Runners_100)
# Find the in-sample error (using the training data)
in_sample <- statisticalModeling::evaluate_model(model, data = Runners_100)
in_sample_error <-
with(in_sample, mean((net - model_output)^2, na.rm = TRUE))
# Calculate MSE for many different trials
# cv_pred_error() repeats a train/test split several times, returning one
# out-of-sample MSE per trial.
trials <- statisticalModeling::cv_pred_error(model)
# View the cross-validated prediction errors
trials
## mse model
## 1 138.1343 model
## 2 143.1356 model
## 3 142.1734 model
## 4 142.8534 model
## 5 137.5054 model
# Find confidence interval on trials and compare to training_error
# One-sample t-test of the CV errors against the in-sample error: the CI
# excluding in_sample_error shows out-of-sample error is reliably larger.
mosaic::t.test(~ mse, mu = in_sample_error, data = trials)
##
## One Sample t-test
##
## data: trials$mse
## t = 7.5746, df = 4, p-value = 0.001629
## alternative hypothesis: true mean is not equal to 131.5594
## 95 percent confidence interval:
## 137.3878 144.1330
## sample estimates:
## mean of x
## 140.7604
# The base model
base_model <- lm(net ~ age + sex, data = Runners_100)
# An augmented model adding previous as an explanatory variable
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Run cross validation trials on the two models
trials <- statisticalModeling::cv_pred_error(base_model, aug_model)
# Compare the two sets of cross-validated errors
# Two-sample t-test: an insignificant difference means 'previous' does not
# demonstrably improve out-of-sample prediction.
t.test(mse ~ model, data = trials)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 1.6086, df = 7.0388, p-value = 0.1515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.228738 6.476066
## sample estimates:
## mean in group aug_model mean in group base_model
## 142.0658 139.4421
Chapter 4 - Exploring data with models
Prediction error for categorical variables:
Exploring data for relationships - example of the NHANES data from library(NHANES):
Example code includes:
data(Runners, package="statisticalModeling")
# Build the null model with rpart()
# A constant "explanatory" variable forces the tree to predict the single
# most common class for every case -- the baseline error rate.
Runners$all_the_same <- 1 # null "explanatory" variable
null_model <- rpart::rpart(start_position ~ all_the_same, data = Runners)
# Evaluate the null model on training data
null_model_output <- statisticalModeling::evaluate_model(null_model, data = Runners, type = "class")
# Calculate the error rate
with(data = null_model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5853618
# Generate a random guess...
null_model_output$random_guess <- mosaic::shuffle(Runners$start_position)
# ...and find the error rate
# Random guessing does worse than always predicting the majority class.
with(data = null_model_output, mean(start_position != random_guess, na.rm = TRUE))
## [1] 0.6498309
# Train the model
model <- rpart::rpart(start_position ~ age + sex, data = Runners, cp = 0.001)
# Get model output with the training data as input
model_output <- statisticalModeling::evaluate_model(model, data = Runners, type = "class")
# Find the error rate
# Inside with(), model_output resolves to the COLUMN named model_output of
# the evaluate_model() result, not the data frame in the workspace.
with(data = model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5567794
# Do not have this data (should be 93x11 for Training_data and 107x11 for Testing_data) - orig.id, all_the_same, training_case
trainData <- c( 14340 , 1667 , 14863 , 15211 , 685 , 16629 , 16620 , 683 , 9695 , 4281 , 15395 , 17308 , 14847 , 2405 , 15696 , 6351 , 10266 , 14345 , 1145 , 9968 , 3409 , 3798 , 4209 , 2084 , 15561 , 7700 , 8620 , 17266 , 1638 , 13963 , 8621 , 14871 , 2945 , 14359 , 9723 , 10371 , 14271 , 826 , 4843 , 15191 , 14171 , 11845 , 15223 , 9213 , 4913 , 8194 , 15509 , 4562 , 15231 , 14317 , 2933 , 2866 , 15242 , 11343 , 15388 , 1104 , 13734 , 17186 , 5427 , 16100 , 5262 , 5873 , 5067 , 1073 , 3164 , 2164 , 1292 , 12337 , 13895 , 4379 , 11012 , 11872 , 10098 , 1130 , 1357 , 6150 , 493 , 7858 , 8761 , 18014 , 445 , 4207 , 15893 , 17022 , 703 , 17615 , 12517 , 181 , 9864 , 8611 , 4171 , 1732 , 11067 )
testData <- c( 16376 , 1316 , 15357 , 8699 , 13896 , 12064 , 13525 , 11807 , 13152 , 4473 , 12926 , 1134 , 7664 , 6597 , 17254 , 5991 , 17042 , 2701 , 2509 , 13264 , 10998 , 10482 , 7534 , 351 , 5866 , 18107 , 18046 , 15454 , 10602 , 10974 , 6988 , 7771 , 8223 , 14225 , 4409 , 2361 , 11462 , 4987 , 8440 , 2483 , 14984 , 14880 , 311 , 7505 , 4371 , 2434 , 15410 , 16068 , 16252 , 5942 , 8123 , 15375 , 15016 , 2379 , 7099 , 5664 , 11381 , 10688 , 1525 , 5506 , 4900 , 16574 , 14272 , 13912 , 3779 , 14584 , 15809 , 2908 , 16329 , 12042 , 1621 , 9248 , 5738 , 1345 , 6319 , 12575 , 3805 , 2895 , 15004 , 9918 , 11422 , 3592 , 10136 , 5941 , 12274 , 14178 , 4667 , 3393 , 11801 , 3814 , 8244 , 11721 , 14940 , 2572 , 14719 , 11398 , 13704 , 17989 , 12056 , 8215 , 8894 , 8303 , 7816 , 14698 , 17293 , 469 , 3533 )
# Rebuild the course's train/test split from the literal id vectors above.
# NOTE(review): testData/trainData are used as ROW POSITIONS into the
# complete-cases subset while also being stored as orig.id -- confirm these
# really index the intended rows of the original course data.
Testing_data <- Runners[complete.cases(Runners), ][testData, ] %>%
mutate(orig.id=as.character(testData), all_the_same=1, training_case=FALSE)
Training_data <- Runners[complete.cases(Runners), ][trainData, ] %>%
mutate(orig.id=as.character(trainData), all_the_same=1, training_case=TRUE)
# Train the models
null_model <- rpart::rpart(start_position ~ all_the_same,
data = Training_data, cp = 0.001)
model_1 <- rpart::rpart(start_position ~ age,
data = Training_data, cp = 0.001)
model_2 <- rpart::rpart(start_position ~ age + sex,
data = Training_data, cp = 0.001)
# Find the out-of-sample error rate
# Models are trained on Training_data but evaluated on Testing_data.
null_output <- statisticalModeling::evaluate_model(null_model, data = Testing_data, type = "class")
model_1_output <- statisticalModeling::evaluate_model(model_1, data = Testing_data, type = "class")
model_2_output <- statisticalModeling::evaluate_model(model_2, data = Testing_data, type = "class")
# Calculate the error rates
null_rate <- with(data = null_output,
mean(start_position != model_output, na.rm = TRUE))
model_1_rate <- with(data = model_1_output,
mean(start_position != model_output, na.rm = TRUE))
model_2_rate <- with(data = model_2_output,
mean(start_position != model_output, na.rm = TRUE))
# Display the error rates
# Out of sample, the richer models do WORSE than the null model here --
# age and sex don't help predict start_position.
null_rate
## [1] 0.5233645
model_1_rate
## [1] 0.588785
model_2_rate
## [1] 0.5700935
# Recursive-partitioning (tree) models visualized with rpart.plot::prp();
# type = 3 controls the label layout of the plotted tree.
model_2 <- rpart::rpart(net ~ age + sex, data = Runners, cp = 0.001)
rpart.plot::prp(model_2, type = 3)
# Birth weight examples: two small models on disjoint predictor sets...
data(Birth_weight, package="statisticalModeling")
model_1 <- rpart::rpart(baby_wt ~ smoke + income,
data = Birth_weight)
model_2 <- rpart::rpart(baby_wt ~ mother_age + mother_wt,
data = Birth_weight)
rpart.plot::prp(model_1, type = 3)
rpart.plot::prp(model_2, type = 3)
# ...then a model combining all four predictors...
model_3 <- rpart::rpart(baby_wt ~ smoke + income + mother_age + mother_wt, data=Birth_weight)
rpart.plot::prp(model_3, type=3)
# ...a model on every available variable (the "." formula shorthand)...
model_full <- rpart::rpart(baby_wt ~ ., data=Birth_weight)
rpart.plot::prp(model_full, type=3)
# ...and a model of gestation on everything except baby_wt.
model_gest <- rpart::rpart(gestation ~ . -baby_wt, data=Birth_weight)
rpart.plot::prp(model_gest, type=3)
Chapter 5 - Covariates and Effect Size
Covariates and uses for models - making predictions with available data, exploring a large/complex dataset, and anticipating the outcome of an intervention:
Effect size - how much does the model output change for a given change in the input?
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Train the model price ~ fireplaces
simple_model <- lm(price ~ fireplaces, data = Houses_for_sale)
# Evaluate simple_model
# evaluate_model() with no data argument tabulates model output at
# representative input values.
statisticalModeling::evaluate_model(simple_model)
## fireplaces model_output
## 1 0 171823.9
## 2 1 238522.7
# Naive estimate of a fireplace's worth: difference in predicted price
# between 1 and 0 fireplaces, ignoring all other variables.
naive_worth <- 238522.7 - 171823.9
naive_worth
## [1] 66698.8
# Train another model including living_area
sophisticated_model <-lm(price ~ fireplaces + living_area, data = Houses_for_sale)
# Evaluate that model
statisticalModeling::evaluate_model(sophisticated_model)
## fireplaces living_area model_output
## 1 0 1000 124043.6
## 2 1 1000 133006.1
## 3 0 2000 233357.1
## 4 1 2000 242319.5
## 5 0 3000 342670.6
## 6 1 3000 351633.0
# Find price difference for fixed living_area
# With living_area held at 2000, the fireplace's estimated worth drops from
# ~66,699 to ~8,962 — most of the naive difference was living_area acting
# as a confounder.
sophisticated_worth <- 242319.5 - 233357.1
sophisticated_worth
## [1] 8962.4
data(Crime, package="statisticalModeling")
# Train model_1 and model_2
# Single-predictor models of crime rate R on X and on W separately.
model_1 <- lm(R ~ X, data = Crime)
model_2 <- lm(R ~ W, data = Crime)
# Evaluate each model...
statisticalModeling::evaluate_model(model_1)
## X model_output
## 1 100 106.82223
## 2 200 89.46721
## 3 300 72.11219
statisticalModeling::evaluate_model(model_2)
## W model_output
## 1 400 68.32909
## 2 600 103.70777
## 3 800 139.08644
# Per-100-unit change in predicted R as X goes 100 -> 200 (negative).
change_with_X <- 89.46721 - 106.82223
change_with_X
## [1] -17.35502
# Per-200-unit change in predicted R as W goes 400 -> 600 (positive).
change_with_W <- 103.70777 - 68.32909
change_with_W
## [1] 35.37868
# Train model_3 using both X and W as explanatory variables
model_3 <- lm(R ~ X + W, data = Crime)
# Evaluate model_3
statisticalModeling::evaluate_model(model_3)
## X W model_output
## 1 100 400 -62.60510
## 2 200 400 31.03422
## 3 300 400 124.67354
## 4 100 600 41.22502
## 5 200 600 134.86434
## 6 300 600 228.50366
## 7 100 800 145.05515
## 8 200 800 238.69447
## 9 300 800 332.33379
# Find the difference in output for each of X and W
# NOTE(review): once W is held constant, the X effect reverses direction
# relative to change_with_X above (X now appears with a positive slope;
# this difference is computed from X = 200 back from X = 300 at W = 600).
change_with_X_holding_W_constant <- 134.86434 - 228.50366
change_with_X_holding_W_constant
## [1] -93.63932
change_with_W_holding_X_constant <- 134.86434 - 31.03422
change_with_W_holding_X_constant
## [1] 103.8301
data(Trucking_jobs, package="statisticalModeling")
# Train the five models
# Successively add covariates (age, hiredyears, title) to the sex-only model
# to see how the apparent gender difference in earnings changes.
model_1 <- lm(earnings ~ sex, data = Trucking_jobs)
model_2 <- lm(earnings ~ sex + age, data = Trucking_jobs)
model_3 <- lm(earnings ~ sex + hiredyears, data = Trucking_jobs)
model_4 <- lm(earnings ~ sex + title, data = Trucking_jobs)
model_5 <- lm(earnings ~ sex + age + hiredyears + title, data = Trucking_jobs)
# Evaluate each model...
# Extra named arguments to evaluate_model() pin the covariates at fixed values
# so the remaining difference is attributable to sex alone.
statisticalModeling::evaluate_model(model_1)
## sex model_output
## 1 M 40236.35
## 2 F 35501.25
statisticalModeling::evaluate_model(model_2, age = 40)
## sex age model_output
## 1 M 40 41077.03
## 2 F 40 38722.71
statisticalModeling::evaluate_model(model_3, hiredyears = 5)
## sex hiredyears model_output
## 1 M 5 39996.93
## 2 F 5 36366.89
statisticalModeling::evaluate_model(model_4, title = "REGL CARRIER REP")
## sex title model_output
## 1 M REGL CARRIER REP 27838.38
## 2 F REGL CARRIER REP 28170.71
statisticalModeling::evaluate_model(model_5, age = 40, hiredyears = 5,
title = "REGL CARRIER REP")
## sex age hiredyears title model_output
## 1 M 40 5 REGL CARRIER REP 30976.42
## 2 F 40 5 REGL CARRIER REP 30991.70
# ...and calculate the gender difference in earnings
# Each diff is (male prediction) - (female prediction). Per the outputs above,
# the gap shrinks as covariates are added and is near zero (slightly negative)
# in the full model.
diff_1 <- 40236.35 - 35501.25
diff_1
## [1] 4735.1
diff_2 <- 41077.03 - 38722.71
diff_2
## [1] 2354.32
diff_3 <- 39996.93 - 36366.89
diff_3
## [1] 3630.04
diff_4 <- 27838.38 - 28170.71
diff_4
## [1] -332.33
diff_5 <- 30976.42 - 30991.70
diff_5
## [1] -15.28
data(AARP, package="statisticalModeling")
# Linear model of insurance Cost on Age, Sex, and Coverage.
modLin <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
statisticalModeling::evaluate_model(modLin)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
# effect_size() reports a "slope" for quantitative inputs (Age, Coverage) and
# a "change" for categorical inputs (Sex), holding the other inputs at the
# typical values shown in the remaining columns.
statisticalModeling::effect_size(modLin, ~ Age)
## slope Age to:Age Sex Coverage
## 1 3.351705 59.5 68.16025 F 20
statisticalModeling::effect_size(modLin, ~ Sex)
## change Sex to:Sex Age Coverage
## 1 10.23278 F M 59.5 20
statisticalModeling::effect_size(modLin, ~ Coverage)
## slope Coverage to:Coverage Age Sex
## 1 1.815365 20 37.23783 59.5 F
data(College_grades, package="statisticalModeling")
# Calculating the GPA
# Modeling gradepoint on student id: each student's fitted value is
# effectively that student's GPA.
gpa_mod_1 <- lm(gradepoint ~ sid, data = College_grades)
# The GPA for two students
statisticalModeling::evaluate_model(gpa_mod_1, sid = c("S32115", "S32262"))
## sid model_output
## 1 S32115 3.448571
## 2 S32262 3.442500
# Use effect_size()
# With no levels specified, effect_size() picks its own pair of sid levels.
statisticalModeling::effect_size(gpa_mod_1, ~ sid)
## change sid to:sid
## 1 0.4886364 S32259 S32364
# Specify from and to levels to compare
statisticalModeling::effect_size(gpa_mod_1, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid
## 1 -0.006071429 S32115 S32262
# A better model?
# Adjusting for department and course level changes the comparison between the
# same two students (from about -0.006 to about +0.42 per the outputs).
gpa_mod_2 <- lm(gradepoint ~ sid + dept + level, data = College_grades)
# Find difference between the same two students as before
statisticalModeling::effect_size(gpa_mod_2, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid dept level
## 1 0.4216295 S32115 S32262 d 200
# Effect sizes for each predictor in a three-variable house-price model.
data(Houses_for_sale, package="statisticalModeling")
modAll <- lm(price ~ living_area + land_value + fireplaces, data=Houses_for_sale)
statisticalModeling::effect_size(modAll, ~ land_value)
## slope land_value to:land_value living_area fireplaces
## 1 0.9559322 25000 60021.17 1634.5 1
statisticalModeling::effect_size(modAll, ~ fireplaces)
## slope fireplaces to:fireplaces living_area land_value
## 1 8100.298 1 1.556102 1634.5 25000
statisticalModeling::effect_size(modAll, ~ living_area)
## slope living_area to:living_area land_value fireplaces
## 1 86.81317 1634.5 2254.436 25000 1
Chapter 1 - Effect Size and Interaction
Multiple explanatory variables - commonly use the mean/median for each continuous variable, and the most common level for each categorical variable:
Categorical response variables - output is a classification rather than continuous:
Interactions among explanatory variables:
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Build your model
my_model <- rpart::rpart(price ~ living_area + bathrooms + pct_college,
data = Houses_for_sale)
# Graph the model
# fmodel() plots model output against the formula's variables; the first
# variable goes on the x-axis, later ones become facets/colors.
statisticalModeling::fmodel(my_model, ~ living_area + bathrooms + pct_college)
data(NHANES, package="NHANES")
# Build the model
mod <- lm(Pulse ~ Height + BMI + Gender, data = NHANES)
# Confirm by reconstructing the graphic provided
# fmodel() returns a ggplot object, so ggplot2 layers (here ylab) can be added.
statisticalModeling::fmodel(mod, ~ Height + BMI + Gender) +
ggplot2::ylab("Pulse")
# Find effect size
statisticalModeling::effect_size(mod, ~ BMI)
## slope BMI to:BMI Height Gender
## 1 0.06025728 25.98 33.35658 166 female
# Replot the model
# Reordering the formula puts BMI on the x-axis this time.
statisticalModeling::fmodel(mod, ~ BMI + Height + Gender) +
ggplot2::ylab("Pulse")
# Categorical response: an rpart classifier of start_position.
model_1 <- rpart::rpart(start_position ~ age + sex + nruns,
data = Runners, cp = 0.001)
# type = "class" gives predicted labels; the default gives class probabilities.
as_class <- statisticalModeling::evaluate_model(model_1, type = "class")
as_prob <- statisticalModeling::evaluate_model(model_1)
# Calculate effect size with respect to sex
# For a categorical response, effect_size() reports one change/slope column
# per output class (calm, eager, mellow).
statisticalModeling::effect_size(model_1, ~ sex)
## change.calm change.eager change.mellow sex to:sex age nruns
## 1 0.01281487 -0.2192357 0.2064208 M F 40 4
# Calculate effect size with respect to age
statisticalModeling::effect_size(model_1, ~ age)
## slope.calm slope.eager slope.mellow age to:age sex nruns
## 1 0.00497811 -0.01316334 0.008185229 40 50.84185 M 4
# Calculate effect size with respect to nruns
statisticalModeling::effect_size(model_1, ~ nruns)
## slope.calm slope.eager slope.mellow nruns to:nruns age sex
## 1 0.004900487 0.02725955 -0.03216004 4 5.734239 40 M
data(Whickham, package="mosaicData")
# An rpart model
mod1 <- rpart::rpart(outcome ~ age + smoker, data = Whickham)
# Logistic regression
# The response is the logical expression outcome == "Alive", modeled with a
# binomial family (logistic regression).
mod2 <- glm(outcome == "Alive" ~ age + smoker,
data = Whickham, family = "binomial")
# Visualize the models with fmodel()
statisticalModeling::fmodel(mod1)
statisticalModeling::fmodel(mod2)
# Find the effect size of smoker
# Per the outputs: the tree shows zero change for smoker at age 46, while the
# logistic model shows a small negative change.
statisticalModeling::effect_size(mod1, ~ smoker)
## change.Alive change.Dead smoker to:smoker age
## 1 0 0 No Yes 46
statisticalModeling::effect_size(mod2, ~ smoker)
## change smoker to:smoker age
## 1 -0.02479699 No Yes 46
data(Birth_weight, package="statisticalModeling")
# Build the model without interaction
# "+" fits parallel effects; "*" below adds the gestation:smoke interaction.
mod1 <- lm(baby_wt ~ gestation + smoke, data=Birth_weight)
# Build the model with interaction
mod2 <- lm(baby_wt ~ gestation * smoke, data=Birth_weight)
# Plot each model
statisticalModeling::fmodel(mod1) +
ggplot2::ylab("baby_wt")
statisticalModeling::fmodel(mod2) +
ggplot2::ylab("baby_wt")
data(Used_Fords, package="statisticalModeling")
# Train model_1
model_1 <- lm(Price ~ Age + Mileage,
data = Used_Fords)
# Train model_2
model_2 <- lm(Price ~ Age * Mileage,
data = Used_Fords)
# Plot both models
statisticalModeling::fmodel(model_1)
statisticalModeling::fmodel(model_2)
# Cross validate and compare prediction errors
# cv_pred_error() returns one mean-square-error row per cross-validation trial
# per model; the t-test compares the two models' MSE distributions.
res <- statisticalModeling::cv_pred_error(model_1, model_2)
t.test(mse ~ model, data = res)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 556.38, df = 6.3179, p-value = 4.66e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2424270 2445428
## sample estimates:
## mean in group model_1 mean in group model_2
## 6086599 3651749
Chapter 2 - Total and Partial Change
Interpreting effect size - magnitude is important, but only if interpreted properly (e.g., in units of output per unit change of input):
R-squared is also known as the “coefficient of determination” and uses a capital R:
Degrees of freedom - Kaggle example based on restaurant data (137 x 40 with City, City.Group, Type, PS1-PS37 and a 137x1 vector Revenue):
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Train a model of house prices
price_model_1 <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of living area
statisticalModeling::effect_size(price_model_1, ~ living_area)
## slope living_area to:living_area land_value fireplaces bathrooms
## 1 76.06617 1634.5 2254.436 25000 1 2
## bedrooms
## 1 3
# Effect size of bathrooms
# step = 1 forces a one-unit change (e.g. 2 -> 3 bathrooms) rather than the
# default step chosen by effect_size().
statisticalModeling::effect_size(price_model_1, ~ bathrooms, step=1)
## slope bathrooms to:bathrooms land_value living_area fireplaces
## 1 26156.43 2 3 25000 1634.5 1
## bedrooms
## 1 3
# Effect size of bedrooms
statisticalModeling::effect_size(price_model_1, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value living_area fireplaces
## 1 -8222.853 3 4 25000 1634.5 1
## bathrooms
## 1 2
# Let living_area change as it will
# Dropping living_area from the model: the bedroom effect is no longer a
# partial effect holding living_area fixed.
price_model_2 <- lm(price ~ land_value + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of bedroom in price_model_2
# Per the outputs: the bedroom slope flips from about -8,223 (living_area held
# constant) to about +13,882 (living_area free to covary).
statisticalModeling::effect_size(price_model_2, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value fireplaces bathrooms
## 1 13882.42 3 4 25000 1 2
# Train a model of house prices
price_model <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Evaluate the model in scenario 1
# Scenario comparisons: each evaluate_model() call fixes living_area,
# bedrooms, and bathrooms; the table varies the unpinned inputs.
statisticalModeling::evaluate_model(price_model, living_area = 2000, bedrooms = 2, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2000 0 1 2 181624.0
## 2 50000 2000 0 1 2 228787.1
## 3 0 2000 1 1 2 185499.2
## 4 50000 2000 1 1 2 232662.4
# Evaluate the model in scenario 2
statisticalModeling::evaluate_model(price_model, living_area = 2140, bedrooms = 3, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2140 0 1 3 184050.4
## 2 50000 2140 0 1 3 231213.5
## 3 0 2140 1 1 3 187925.7
## 4 50000 2140 1 1 3 235088.8
# Find the difference in output
# Total change from scenario 1 to scenario 2 (comparing the land_value=50000,
# no-fireplace rows of each table).
price_diff <- 231213.5 - 228787.1
price_diff
## [1] 2426.4
# Evaluate the second scenario again, but add a half bath
statisticalModeling::evaluate_model(price_model, living_area = 2165, bedrooms = 3, bathrooms = 1.5)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2165 0 1.5 3 199030.3
## 2 50000 2165 0 1.5 3 246193.4
## 3 0 2165 1 1.5 3 202905.5
## 4 50000 2165 1 1.5 3 250068.7
# Calculate the price difference
new_price_diff <- 246193.4 - 228787.1
new_price_diff
## [1] 17406.3
# Fit model
car_price_model <- lm(Price ~ Age + Mileage, data = Used_Fords)
# Partial effect size
# Partial effect of Age: Mileage is held constant at its typical value.
statisticalModeling::effect_size(car_price_model, ~ Age)
## slope Age to:Age Mileage
## 1 -573.5044 3 6.284152 48897.5
# To find total effect size
# Total effect: let Mileage change along with Age across the two scenarios.
statisticalModeling::evaluate_model(car_price_model, Age = 6, Mileage = 42000)
## Age Mileage model_output
## 1 6 42000 9523.781
statisticalModeling::evaluate_model(car_price_model, Age = 7, Mileage = 50000)
## Age Mileage model_output
## 1 7 50000 8400.389
# Price difference between scenarios (round to nearest dollar)
price_difference <- 8400 - 9524
price_difference
## [1] -1124
# Effect for age without mileage in the model
car_price_model_2 <- lm(Price ~ Age, data = Used_Fords)
# Calculate partial effect size
# Per the outputs: with Mileage omitted, the Age slope (about -1124.6/year)
# matches the total one-year change computed above (about -1124).
statisticalModeling::effect_size(car_price_model_2, ~ Age)
## slope Age to:Age
## 1 -1124.556 3 6.284152
data(College_grades, package="statisticalModeling")
data(AARP, package="statisticalModeling")
data(Tadpoles, package="statisticalModeling")
# Keep only rows with no missing values before modeling.
College_grades <- College_grades[complete.cases(College_grades), ]
# Train some models
model_1 <- lm(gradepoint ~ sid, data = College_grades)
model_2 <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
# I(rtemp^2) adds a quadratic term in rtemp alongside the linear one.
model_3 <- lm(vmax ~ group + (rtemp + I(rtemp^2)), data = Tadpoles)
# Calculate model output on training data
output_1 <- statisticalModeling::evaluate_model(model_1, data = College_grades)
output_2 <- statisticalModeling::evaluate_model(model_2, data = AARP)
output_3 <- statisticalModeling::evaluate_model(model_3, data = Tadpoles)
# R-squared for the models
# R-squared computed as var(fitted values) / var(response) on the training data.
with(output_1, var(model_output) / var(gradepoint))
## [1] 0.3222716
with(output_2, var(model_output) / var(Cost))
## [1] 0.8062783
with(output_3, var(model_output) / var(vmax))
## [1] 0.4310651
data(HDD_Minneapolis, package="statisticalModeling")
# The two models
# Heating degree days modeled on year vs on month.
model_1 <- lm(hdd ~ year, data = HDD_Minneapolis)
model_2 <- lm(hdd ~ month, data = HDD_Minneapolis)
# Find the model output on the training data for each model
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
# Find R-squared for each of the 2 models
# Per the outputs: month explains nearly all the variance; year explains
# essentially none.
with(output_1, var(model_output) / var(hdd))
## [1] 0.0001121255
with(output_2, var(model_output) / var(hdd))
## [1] 0.9547171
# DO NOT HAVE THIS DATASET - Training is 267 x 12 (field 12 is "bogus", a 267x200 matrix of random numbers)
# This whole exercise is kept commented out because the Training dataset is
# unavailable; it illustrates R-squared inflation from junk predictors.
# Train model_1 without bogus
# model_1 <- lm(wage ~ sector, data = Training)
# Train model_2 with bogus
# model_2 <- lm(wage ~ sector + bogus, data = Training)
# Calculate R-squared using the training data
# output_1 <- statisticalModeling::evaluate_model(model_1, data = Training)
# output_2 <- statisticalModeling::evaluate_model(model_2, data = Training)
# with(output_1, var(model_output) / var(wage))
# with(output_2, var(model_output) / var(wage))
# Compare cross-validated MSE
# boxplot(mse ~ model, data = statisticalModeling::cv_pred_error(model_1, model_2))
data(CPS85, package="mosaicData")
# Train the four models
# model_0 has no explanatory variables; the others regress wage on purely
# random predictors (mosaic::rand(n) generates n columns of random numbers).
model_0 <- lm(wage ~ NULL, data = CPS85)
model_1 <- lm(wage ~ mosaic::rand(100), data = CPS85)
model_2 <- lm(wage ~ mosaic::rand(200), data = CPS85)
model_3 <- lm(wage ~ mosaic::rand(300), data = CPS85)
# Evaluate the models on the training data
output_0 <- statisticalModeling::evaluate_model(model_0, on_training = TRUE)
output_1 <- statisticalModeling::evaluate_model(model_1, on_training = TRUE)
output_2 <- statisticalModeling::evaluate_model(model_2, on_training = TRUE)
output_3 <- statisticalModeling::evaluate_model(model_3, on_training = TRUE)
# Compute R-squared for each model
# Per the outputs: training R-squared climbs steadily with the number of
# random predictors even though they carry no information.
with(output_0, var(model_output) / var(wage))
## [1] 0
with(output_1, var(model_output) / var(wage))
## [1] 0.1885643
with(output_2, var(model_output) / var(wage))
## [1] 0.3537312
with(output_3, var(model_output) / var(wage))
## [1] 0.5709465
# Compare the null model to model_3 using cross validation
# Cross-validation (3 trials) is the honest comparison of the two models.
cv_results <- statisticalModeling::cv_pred_error(model_0, model_3, ntrials = 3)
boxplot(mse ~ model, data = cv_results)
# Train this model with 24 degrees of freedom
# year is numeric here, so year * month uses relatively few coefficients.
model_1 <- lm(hdd ~ year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
with(output_1, var(model_output) / var(hdd))
## [1] 0.9554951
# Oops! Numerical year changed to categorical
HDD_Minneapolis$categorical_year <- as.character(HDD_Minneapolis$year)
# This model has many more degrees of freedom
# One coefficient per year-month combination: enough parameters to fit the
# training data exactly.
model_2 <- lm(hdd ~ categorical_year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
## Warning in predict.lm(structure(list(coefficients =
## structure(c(580.000000000084, : prediction from a rank-deficient fit may be
## misleading
# Per the output: R-squared of exactly 1 — a symptom of overfitting, flagged
# by the rank-deficient-fit warning above.
with(output_2, var(model_output) / var(hdd))
## [1] 1
Chapter 3 - Sampling Variability